[PATCH v2 03/15] nbd: Prepare for 64-bit request effect lengths

2022-11-14 Thread Eric Blake
Widen the length field of NBDRequest to 64-bits, although we can
assert that all current uses are still under 32 bits.  Move the
request magic number to nbd.h, to live alongside the reply magic
number.  Add the necessary bools that will eventually track whether
the client successfully negotiated extended headers with the server,
allowing the nbd driver to pass larger requests along where
possible; in this patch, however, they always remain false, so there
is no semantic change yet.
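
As a concrete illustration of the guard this message describes, a
compact-request serializer might look like the sketch below.  This is
a hedged illustration, not the patch's exact code: the field offsets
follow the NBD spec's 28-byte compact request, the helper name and
buffer handling are invented, and stl_be_p()/stw_be_p()/stq_be_p()
are qemu's existing byte-swap store helpers from qemu/bswap.h.

/* Sketch: writing the compact (32-bit length) request form now that
 * NBDRequest.len is 64 bits in memory.  The assertion encodes the
 * commit's claim that all current callers still fit in 32 bits. */
static void serialize_compact_request(uint8_t wire[28], const NBDRequest *req)
{
    assert(req->len <= UINT32_MAX);       /* no extended headers yet */
    stl_be_p(wire + 0, NBD_REQUEST_MAGIC);
    stw_be_p(wire + 4, req->flags);
    stw_be_p(wire + 6, req->type);
    stq_be_p(wire + 8, req->handle);
    stq_be_p(wire + 16, req->from);
    stl_be_p(wire + 24, (uint32_t)req->len);
}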

Signed-off-by: Eric Blake 
---
 include/block/nbd.h | 21 -
 nbd/nbd-internal.h  |  3 +--
 block/nbd.c | 35 ---
 nbd/client.c|  9 ++---
 nbd/server.c| 12 +---
 nbd/trace-events|  8 
 6 files changed, 56 insertions(+), 32 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index 1330dbc18b..e357452a57 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -52,17 +52,16 @@ typedef struct NBDOptionReplyMetaContext {

 /* Transmission phase structs
  *
- * Note: these are _NOT_ the same as the network representation of an NBD
- * request and reply!
+ * Note: NBDRequest is _NOT_ the same as the network representation of an NBD
+ * request!
  */
-struct NBDRequest {
+typedef struct NBDRequest {
 uint64_t handle;
-uint64_t from;
-uint32_t len;
+uint64_t from;  /* Offset touched by the command */
+uint64_t len;   /* Effect length; 32 bit limit without extended headers */
 uint16_t flags; /* NBD_CMD_FLAG_* */
-uint16_t type; /* NBD_CMD_* */
-};
-typedef struct NBDRequest NBDRequest;
+uint16_t type;  /* NBD_CMD_* */
+} NBDRequest;

 typedef struct NBDSimpleReply {
 uint32_t magic;  /* NBD_SIMPLE_REPLY_MAGIC */
@@ -235,6 +234,9 @@ enum {
  */
 #define NBD_MAX_STRING_SIZE 4096

+/* Transmission request structure */
+#define NBD_REQUEST_MAGIC   0x25609513
+
 /* Two types of reply structures */
 #define NBD_SIMPLE_REPLY_MAGIC  0x67446698
 #define NBD_STRUCTURED_REPLY_MAGIC  0x668e33ef
@@ -293,6 +295,7 @@ struct NBDExportInfo {
 /* In-out fields, set by client before nbd_receive_negotiate() and
  * updated by server results during nbd_receive_negotiate() */
 bool structured_reply;
+bool extended_headers;
 bool base_allocation; /* base:allocation context for NBD_CMD_BLOCK_STATUS */

 /* Set by server results during nbd_receive_negotiate() and
int nbd_receive_export_list(QIOChannel *ioc, QCryptoTLSCreds *tlscreds,
 Error **errp);
 int nbd_init(int fd, QIOChannelSocket *sioc, NBDExportInfo *info,
  Error **errp);
-int nbd_send_request(QIOChannel *ioc, NBDRequest *request);
+int nbd_send_request(QIOChannel *ioc, NBDRequest *request, bool ext_hdr);
 int coroutine_fn nbd_receive_reply(BlockDriverState *bs, QIOChannel *ioc,
NBDReply *reply, Error **errp);
 int nbd_client(int fd);
diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h
index 1b2141ab4b..0016793ff4 100644
--- a/nbd/nbd-internal.h
+++ b/nbd/nbd-internal.h
@@ -1,7 +1,7 @@
 /*
  * NBD Internal Declarations
  *
- * Copyright (C) 2016 Red Hat, Inc.
+ * Copyright (C) 2016-2021 Red Hat, Inc.
  *
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
@@ -45,7 +45,6 @@
 #define NBD_OLDSTYLE_NEGOTIATE_SIZE (8 + 8 + 8 + 4 + 124)

 #define NBD_INIT_MAGIC  0x4e42444d41474943LL /* ASCII "NBDMAGIC" */
-#define NBD_REQUEST_MAGIC   0x25609513
 #define NBD_OPTS_MAGIC  0x49484156454F5054LL /* ASCII "IHAVEOPT" */
 #define NBD_CLIENT_MAGIC    0x0000420281861253LL
 #define NBD_REP_MAGIC   0x0003e889045565a9LL
diff --git a/block/nbd.c b/block/nbd.c
index 7d485c86d2..32681d2867 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -2,7 +2,7 @@
  * QEMU Block driver for  NBD
  *
  * Copyright (c) 2019 Virtuozzo International GmbH.
- * Copyright (C) 2016 Red Hat, Inc.
+ * Copyright (C) 2016-2022 Red Hat, Inc.
  * Copyright (C) 2008 Bull S.A.S.
  * Author: Laurent Vivier 
  *
@@ -340,7 +340,7 @@ int coroutine_fn nbd_co_do_establish_connection(BlockDriverState *bs,
  */
 NBDRequest request = { .type = NBD_CMD_DISC };

-nbd_send_request(s->ioc, &request);
+nbd_send_request(s->ioc, &request, s->info.extended_headers);

 yank_unregister_function(BLOCKDEV_YANK_INSTANCE(s->bs->node_name),
  nbd_yank, bs);
@@ -524,14 +524,14 @@ static int coroutine_fn nbd_co_send_request(BlockDriverState *bs,

 if (qiov) {
 qio_channel_set_cork(s->ioc, true);
-rc = nbd_send_request(s->ioc, request);
+rc = nbd_send_request(s->ioc, request, s->info.extended_headers);
 if (rc >= 0 && qio_channel_writev_all(s->ioc, qiov->iov, qiov->niov,
   NULL) < 0) {
 rc = -EIO;
 }
   

[libnbd PATCH v2 02/23] internal: Refactor layout of replies in sbuf

2022-11-14 Thread Eric Blake
In order to more easily add a third reply type with an even larger
header, but where the payload will look the same for both structured
and extended replies, it is nicer if simple and structured replies are
nested inside the same layer of sbuf.reply.hdr.  While at it, note
that while .or and .sr are structs declared within the overall sbuf
union, we never read into both halves of those structs at the same
time, so it does not matter whether their two halves are consecutive.
Dropping the packed notation on those structs means the compiler can
align .payload more naturally, which may slightly improve performance
on some platforms, even if it makes the overall union a few bytes
larger due to padding.

Visually, this patch changes the layout from:

 offset  simple                 structured
+------------------------------------------------------------+
| union sbuf                                                 |
|      +---------------------+-----------------------------+ |
|      | struct simple_reply | union sr                    | |
|      | +-----------------+ | +-------------------------+ | |
|      | |                 | | | struct structured_reply | | |
|      | |                 | | | +---------------------+ | | |
|  0   | | uint32_t magic  | | | | uint32_t magic      | | | |
|  4   | | uint32_t error  | | | | uint16_t flags      | | | |
|  6   | |                 | | | | uint16_t type       | | | |
|  8   | | uint64_t handle | | | | uint64_t handle     | | | |
|      | +-----------------+ | | |                     | | | |
| 16   | [padding]           | | | uint32_t length     | | | |
|      |                     | | +---------------------+ | | |
|      |                     | +-------------------------+ | |
|      |                     | union payload               | |
|      |                     | +------------+------------+ | |
| 20   |                     | | ...        | ...        | | |
|      |                     | +------------+------------+ | |
|      +---------------------+-----------------------------+ |
+------------------------------------------------------------+

to:

 offset    simple               structured
+------------------------------------------------------------+
| union sbuf                                                 |
|      +---------------------------------------------------+ |
|      | struct reply                                      | |
|      | +-----------------------------------------------+ | |
|      | | union hdr                                     | | |
|      | | +---------------------+---------------------+ | | |
|      | | | struct simple       | struct structured   | | | |
|      | | | +-----------------+ | +-----------------+ | | | |
|  0   | | | | uint32_t magic  | | | uint32_t magic  | | | | |
|  4   | | | | uint32_t error  | | | uint16_t flags  | | | | |
|  6   | | | |                 | | | uint16_t type   | | | | |
|  8   | | | | uint64_t handle | | | uint64_t handle | | | | |
|      | | | +-----------------+ | |                 | | | | |
| 16   | | | [padding]           | | uint32_t length | | | | |
|      | | |                     | +-----------------+ | | | |
| 20   | | |                     | [padding]           | | | |
|      | | +---------------------+---------------------+ | | |
|      | +-----------------------------------------------+ | |
|      | union payload                                     | |
|      |   +---------------------+---------------------+   | |
| 24   |   | ...                 | ...                 |   | |
|      |   +---------------------+---------------------+   | |
|      +---------------------------------------------------+ |
+------------------------------------------------------------+

Technically, whether the payload union offset moves to byte 24 (with
20-23 now padding) or stays at 20 depends on compiler ABI; but many
systems prefer that any struct with a uint64_t provide 8-byte
alignment to its containing union.

The commit is largely mechanical, and there should be no semantic
change.
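
In C terms, the new nesting looks roughly like the sketch below.
This is an illustration with minimal stand-in types, not the actual
declarations; the real definitions live in lib/internal.h and
lib/nbd-protocol.h, where the wire structs carry NBD_ATTRIBUTE_PACKED
and the payload union has many more members.

#include <stdint.h>

/* Stand-ins for the packed wire structs in lib/nbd-protocol.h. */
struct nbd_simple_reply     { uint32_t magic; uint32_t error;
                              uint64_t handle; };
struct nbd_structured_reply { uint32_t magic; uint16_t flags;
                              uint16_t type;  uint64_t handle;
                              uint32_t length; };

/* After the refactor both header flavors share one union, and the
 * payload union follows it exactly once. */
union sbuf {
  /* ... negotiation-phase members elided ... */
  struct {
    union {
      struct nbd_simple_reply simple;
      struct nbd_structured_reply structured;
    } hdr;
    union {
      uint64_t offset;          /* placeholder payload member */
      char err_msg[64];         /* placeholder payload member */
    } payload;
  } reply;
};
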
---
 lib/internal.h  |  12 ++--
 generator/states-reply-simple.c |   4 +-
 generator/states-reply-structured.c | 103 ++--
 generator/states-reply.c|  10 +--
 4 files changed, 66 insertions(+), 63 deletions(-)

diff --git a/lib/internal.h b/lib/internal.h
index fe81f1a0..f81c41ba 100644
--- a/lib/internal.h
+++ b/lib/internal.h
@@ -230,14 +230,16 @@ struct nbd_handle {
 struct {
   struct nbd_fixed_new_option_reply_meta_context context;
   char str[NBD_MAX_STRING];
-}  __attribute__((packed)) context;
+} __attribute__((packed)) context;
 char err_msg[NBD_MAX_STRING];
   } payload;
-}  __attribute__((packed)) or;
+} or;
 struct nbd_export_name_option_reply export_name_reply;
-struct nbd_simple_reply simple_reply;
 struct {
-  struct nbd_structured_reply structured_reply;
+  uni

[libnbd PATCH v2 08/23] block_status: Track 64-bit extents internally

2022-11-14 Thread Eric Blake
When extended headers are in use, the server can send us 64-bit
extents, even for a 32-bit query (if the server knows the entire image
is data, for example, or if the metacontext has a status definition
that uses more than 32 bits).  Also, while most contexts only have
32-bit flags, a server is allowed to negotiate contexts with 64-bit
flags when extended headers are in use.  Thus, for maximum
flexibility, we are better off storing 64-bit data internally, with a
goal of letting the client's 32-bit interface work as much as
possible, and for a future API addition of a 64-bit interface to work
even when the server only gave 32-bit results.

For backwards compatibility, a client that never negotiates a 64-bit
status context can be handled without errors by truncating any 64-bit
lengths down to just under 4G; so the old 32-bit interface will
continue to work in most cases.  But we can't truncate status down; if
a user requests an extended status, the 32-bit interface can now
report EOVERFLOW for that context (although that situation can't
happen until a later patch actually turns on the use of extended
headers).
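
A sketch of that truncating shim follows; the names are hypothetical
(libnbd's actual helper lives in lib/rw.c and differs in detail), and
nbd_extent here stands in for the new 64-bit {length, flags} pair
introduced by this series.

#include <errno.h>
#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t length; uint64_t flags; } nbd_extent;

/* Convert 64-bit extents to the legacy flat uint32_t pairs.  Lengths
 * are clamped to just under 4G; status flags that need more than 32
 * bits cannot be represented at all, so report EOVERFLOW for them. */
static size_t
narrow_extents (const nbd_extent *wide, size_t n,
                uint32_t *pairs /* 2*n slots */, int *error)
{
  size_t i;

  for (i = 0; i < n; i++) {
    uint64_t len = wide[i].length;

    if (wide[i].flags > UINT32_MAX) {
      *error = EOVERFLOW;       /* wide status cannot be narrowed */
      break;
    }
    if (len > UINT32_MAX)
      len = UINT32_MAX & ~511U; /* clamp below 4G (the alignment here
                                 * is an illustrative choice) */
    pairs[2 * i] = (uint32_t)len;
    pairs[2 * i + 1] = (uint32_t)wide[i].flags;
  }
  return i;                     /* number of entries converted */
}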

Note that the existing 32-bit nbd_block_status() API is now slightly
slower, particularly when talking with a server that lacks extended
headers: we are doing double size conversions.  But this speed penalty
is likely in the noise compared to the network delays, and ideally
clients will switch over to the new 64-bit interfaces as more and more
servers start supporting extended headers.

One of the trickier aspects of this patch is auditing that both the
user's extent and our malloc'd shim get cleaned up once on all
possible paths, so that there is neither a leak nor a double free.
---
 lib/internal.h  |  8 +++-
 generator/states-reply-structured.c | 30 -
 lib/handle.c|  2 +-
 lib/rw.c| 70 -
 4 files changed, 95 insertions(+), 15 deletions(-)

diff --git a/lib/internal.h b/lib/internal.h
index 73fd24c0..0c23f882 100644
--- a/lib/internal.h
+++ b/lib/internal.h
@@ -80,7 +80,7 @@ struct export {

 struct command_cb {
   union {
-nbd_extent_callback extent;
+nbd_extent64_callback extent;
 nbd_chunk_callback chunk;
 nbd_list_callback list;
 nbd_context_callback context;
@@ -302,7 +302,11 @@ struct nbd_handle {
   size_t querynum;

   /* When receiving block status, this is used. */
-  uint32_t *bs_entries;
+  union {
+char *storage;  /* malloc's view */
+nbd_extent *normal; /* Our 64-bit preferred internal form; n slots */
+uint32_t *narrow;   /* 32-bit NBD_REPLY_TYPE_BLOCK_STATUS form; n*2 slots */
+  } bs_entries;

   /* Commands which are waiting to be issued [meaning the request
* packet is sent to the server].  This is used as a simple linked
diff --git a/generator/states-reply-structured.c b/generator/states-reply-structured.c
index da9894c6..d23e56a9 100644
--- a/generator/states-reply-structured.c
+++ b/generator/states-reply-structured.c
@@ -436,6 +436,7 @@  REPLY.STRUCTURED_REPLY.RECV_OFFSET_HOLE:
  REPLY.STRUCTURED_REPLY.RECV_BS_HEADER:
   struct command *cmd = h->reply_cmd;
   uint32_t length;
+  uint32_t count;

   switch (recv_into_rbuf (h)) {
   case -1: SET_NEXT_STATE (%.DEAD); return 0;
@@ -450,15 +451,19 @@  REPLY.STRUCTURED_REPLY.RECV_BS_HEADER:
 assert (cmd->type == NBD_CMD_BLOCK_STATUS);
 assert (length >= 12);
 length -= sizeof h->sbuf.reply.payload.bs_hdr;
+count = length / (2 * sizeof (uint32_t));

-free (h->bs_entries);
-h->bs_entries = malloc (length);
-if (h->bs_entries == NULL) {
+/* Read raw data into a subset of h->bs_entries, then expand it
+ * into place later during byte-swapping.
+ */
+free (h->bs_entries.storage);
+h->bs_entries.storage = malloc (count * sizeof *h->bs_entries.normal);
+if (h->bs_entries.storage == NULL) {
   SET_NEXT_STATE (%.DEAD);
   set_error (errno, "malloc");
   return 0;
 }
-h->rbuf = h->bs_entries;
+h->rbuf = h->bs_entries.narrow;
 h->rlen = length;
 SET_NEXT_STATE (%RECV_BS_ENTRIES);
   }
@@ -470,6 +475,7 @@  REPLY.STRUCTURED_REPLY.RECV_BS_ENTRIES:
   uint32_t count;
   size_t i;
   uint32_t context_id;
+  uint32_t *raw;

   switch (recv_into_rbuf (h)) {
   case -1: SET_NEXT_STATE (%.DEAD); return 0;
@@ -483,17 +489,21 @@  REPLY.STRUCTURED_REPLY.RECV_BS_ENTRIES:
 assert (cmd); /* guaranteed by CHECK */
 assert (cmd->type == NBD_CMD_BLOCK_STATUS);
 assert (CALLBACK_IS_NOT_NULL (cmd->cb.fn.extent));
-assert (h->bs_entries);
+assert (h->bs_entries.normal);
 assert (length >= 12);
 assert (h->meta_valid);
 count = (length - sizeof h->sbuf.reply.payload.bs_hdr) /
-  sizeof *h->bs_entries;
+  (2 * sizeof (uint32_t));

 /* Need to byte-swap the entries returned, but apart from that we
- * don't validate them.
+ * don't validate them.  Reverse order is essential, since w
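
The reverse-order, in-place widening referred to above can be
sketched as follows.  This is a hedged illustration, not libnbd's
exact code: the raw 32-bit pairs sit at the front of the very
allocation that will hold the 64-bit extents, so converting from the
last entry backwards never overwrites a pair that has not been read
yet.

#include <endian.h>   /* be32toh; glibc/Linux */
#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t length; uint64_t flags; } nbd_extent;

/* narrow aliases the same buffer as normal (its first count*8 bytes),
 * so expand in place from the end toward the start. */
static void
widen_in_place (nbd_extent *normal, const uint32_t *narrow, size_t count)
{
  size_t i = count;

  while (i-- > 0) {
    /* Copy the pair to locals first: for small i the source bytes
     * overlap the destination slot. */
    uint32_t len  = be32toh (narrow[2 * i]);
    uint32_t flag = be32toh (narrow[2 * i + 1]);

    normal[i].length = len;
    normal[i].flags  = flag;
  }
}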

[PATCH v2 0/6] NBD spec changes for 64-bit extensions

2022-11-14 Thread Eric Blake
This is the NBD spec series; there are matching qemu and libnbd
patches that implement the changes in this series.  I'm happy to drop
the RFC patches from all three, but wanted the conversation on whether
it makes sense to have 64-bit holes during NBD_CMD_READ first (it
would make more sense if we had a way for a client and server to agree
on a single-transaction buffer limit much larger than 32M).

Eric Blake (6):
  spec: Recommend cap on NBD_REPLY_TYPE_BLOCK_STATUS length
  spec: Tweak description of maximum block size
  spec: Add NBD_OPT_EXTENDED_HEADERS
  spec: Allow 64-bit block status results
  spec: Introduce NBD_FLAG_BLOCK_STATUS_PAYLOAD
  RFC: spec: Introduce NBD_REPLY_TYPE_OFFSET_HOLE_EXT

 doc/proto.md | 698 ++-
 1 file changed, 521 insertions(+), 177 deletions(-)

-- 
2.38.1




[libnbd PATCH v2 03/23] protocol: Add definitions for extended headers

2022-11-14 Thread Eric Blake
Add the magic numbers and new structs necessary to implement the NBD
protocol extension of extended headers providing 64-bit lengths.  This
corresponds to upstream nbd commits XXX-XXX[*].

---
[*] FIXME update commit ids before pushing
---
 lib/nbd-protocol.h | 66 --
 1 file changed, 52 insertions(+), 14 deletions(-)

diff --git a/lib/nbd-protocol.h b/lib/nbd-protocol.h
index 4400d3ab..ac569a11 100644
--- a/lib/nbd-protocol.h
+++ b/lib/nbd-protocol.h
@@ -124,6 +124,7 @@ struct nbd_fixed_new_option_reply {
 #define NBD_OPT_STRUCTURED_REPLY   8
 #define NBD_OPT_LIST_META_CONTEXT  9
 #define NBD_OPT_SET_META_CONTEXT   10
+#define NBD_OPT_EXTENDED_HEADERS   11

 #define NBD_REP_ERR(val) (0x80000000 | (val))
 #define NBD_REP_IS_ERR(val) (!!((val) & 0x80000000))
@@ -141,6 +142,7 @@ struct nbd_fixed_new_option_reply {
 #define NBD_REP_ERR_SHUTDOWN NBD_REP_ERR (7)
 #define NBD_REP_ERR_BLOCK_SIZE_REQD  NBD_REP_ERR (8)
 #define NBD_REP_ERR_TOO_BIG  NBD_REP_ERR (9)
+#define NBD_REP_ERR_EXT_HEADER_REQD  NBD_REP_ERR (10)

 #define NBD_INFO_EXPORT  0
 #define NBD_INFO_NAME1
@@ -182,16 +184,26 @@ struct nbd_fixed_new_option_reply_meta_context {
   /* followed by a string */
 } NBD_ATTRIBUTE_PACKED;

-/* Request (client -> server). */
+/* Compact request (client -> server). */
 struct nbd_request {
   uint32_t magic;   /* NBD_REQUEST_MAGIC. */
-  uint16_t flags;   /* Request flags. */
-  uint16_t type;/* Request type. */
+  uint16_t flags;   /* Request flags: NBD_CMD_FLAG_*. */
+  uint16_t type;/* Request type: NBD_CMD_*. */
   uint64_t handle;  /* Opaque handle. */
   uint64_t offset;  /* Request offset. */
   uint32_t count;   /* Request length. */
 } NBD_ATTRIBUTE_PACKED;

+/* Extended request (client -> server). */
+struct nbd_request_ext {
+  uint32_t magic;   /* NBD_EXTENDED_REQUEST_MAGIC. */
+  uint16_t flags;   /* Request flags: NBD_CMD_FLAG_*. */
+  uint16_t type;/* Request type: NBD_CMD_*. */
+  uint64_t handle;  /* Opaque handle. */
+  uint64_t offset;  /* Request offset. */
+  uint64_t count;   /* Request effect or payload length. */
+} NBD_ATTRIBUTE_PACKED;
+
 /* Simple reply (server -> client). */
 struct nbd_simple_reply {
   uint32_t magic;   /* NBD_SIMPLE_REPLY_MAGIC. */
@@ -208,6 +220,16 @@ struct nbd_structured_reply {
   uint32_t length;  /* Length of payload which follows. */
 } NBD_ATTRIBUTE_PACKED;

+/* Extended reply (server -> client). */
+struct nbd_extended_reply {
+  uint32_t magic;   /* NBD_EXTENDED_REPLY_MAGIC. */
+  uint16_t flags;   /* NBD_REPLY_FLAG_* */
+  uint16_t type;/* NBD_REPLY_TYPE_* */
+  uint64_t handle;  /* Opaque handle. */
+  uint64_t offset;  /* Client's offset. */
+  uint64_t length;  /* Length of payload which follows. */
+} NBD_ATTRIBUTE_PACKED;
+
 struct nbd_structured_reply_offset_data {
   uint64_t offset;  /* offset */
   /* Followed by data. */
@@ -224,11 +246,23 @@ struct nbd_block_descriptor {
   uint32_t status_flags;/* block type (hole etc) */
 } NBD_ATTRIBUTE_PACKED;

+/* NBD_REPLY_TYPE_BLOCK_STATUS_EXT block descriptor. */
+struct nbd_block_descriptor_ext {
+  uint64_t length;  /* length of block */
+  uint64_t status_flags;/* block type (hole etc) */
+} NBD_ATTRIBUTE_PACKED;
+
 struct nbd_structured_reply_block_status_hdr {
   uint32_t context_id;  /* metadata context ID */
   /* followed by array of nbd_block_descriptor extents */
 } NBD_ATTRIBUTE_PACKED;

+struct nbd_structured_reply_block_status_ext_hdr {
+  uint32_t context_id;  /* metadata context ID */
+  uint32_t count;   /* 0, or length of following array */
+  /* followed by array of nbd_block_descriptor_ext extents */
+} NBD_ATTRIBUTE_PACKED;
+
 struct nbd_structured_reply_error {
   uint32_t error;   /* NBD_E* error number */
   uint16_t len; /* Length of human readable error. */
@@ -236,8 +270,10 @@ struct nbd_structured_reply_error {
 } NBD_ATTRIBUTE_PACKED;

 #define NBD_REQUEST_MAGIC   0x25609513
+#define NBD_EXTENDED_REQUEST_MAGIC  0x21e41c71
 #define NBD_SIMPLE_REPLY_MAGIC  0x67446698
 #define NBD_STRUCTURED_REPLY_MAGIC  0x668e33ef
+#define NBD_EXTENDED_REPLY_MAGIC0x6e8a278c

 /* Structured reply flags. */
 #define NBD_REPLY_FLAG_DONE (1<<0)
@@ -246,12 +282,13 @@ struct nbd_structured_reply_error {
 #define NBD_REPLY_TYPE_IS_ERR(val) (!!((val) & (1<<15)))

 /* Structured reply types. */
-#define NBD_REPLY_TYPE_NONE 0
-#define NBD_REPLY_TYPE_OFFSET_DATA  1
-#define NBD_REPLY_TYPE_OFFSET_HOLE  2
-#define NBD_REPLY_TYPE_BLOCK_STATUS 5
-#define NBD_REPLY_TYPE_ERROR    NBD_REPLY_TYPE_ERR (1)
-#define NBD_REPLY_TYPE_ERROR_OFFS

[PATCH v2 3/6] spec: Add NBD_OPT_EXTENDED_HEADERS

2022-11-14 Thread Eric Blake
Add a new negotiation feature where the client and server agree to use
larger packet headers on every packet sent during transmission phase.
This has two purposes: first, it makes it possible to perform
operations like trim, write zeroes, and block status on more than 2^32
bytes in a single command.  For NBD_CMD_READ, replies are still
implicitly capped by the maximum block payload limits (generally 32M);
if you want to know whether a hole exceeds what 32 bits can
represent, you'll use BLOCK_STATUS instead of hoping that a large READ will
either return a hole or report overflow.  But for
NBD_CMD_BLOCK_STATUS, it is very useful to be able to report a status
extent with a size larger than 32-bits, in some cases even if the
client's request was for smaller than 32-bits (such as when it is
known that the entire image is not sparse).  Thus, the wording chosen
here is careful to permit a server to use either flavor status chunk
type in its reply, and clients requesting extended headers must be
prepared for both reply types.

Second, when structured replies are active, clients have to deal with
the difference between 16- and 20-byte headers of simple
vs. structured replies, which impacts performance if the client must
perform multiple syscalls to first read the magic before knowing if
there are even additional bytes to read to learn a payload length.  In
extended header mode, all headers are the same width and there are no
simple replies permitted.  The layout of the reply header is more like
the request header; and including the client's offset in the reply
makes it easier to convert between absolute and relative offsets for
replies to NBD_CMD_READ.  Similarly, by having extended mode use a
power-of-2 sizing, it becomes easier to manipulate arrays of headers
without worrying about an individual header crossing a cache line.
However, note that this change only affects the headers; data payloads
can still be unaligned (for example, a client performing 1-byte reads
or writes).  We would need to negotiate yet another extension if we
wanted to ensure that all NBD transmission packets started on an
8-byte boundary after option haggling has completed.
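
To make the syscall argument concrete, here is a hedged client-side
sketch (read_full() is an assumed read-exactly-n-bytes helper; only
the magic values and header sizes come from the spec):

#include <endian.h>
#include <stdint.h>
#include <string.h>

#define NBD_SIMPLE_REPLY_MAGIC      0x67446698
#define NBD_STRUCTURED_REPLY_MAGIC  0x668e33ef

extern void read_full (int fd, void *buf, size_t n);  /* assumed helper */

/* Returns the header size consumed: 16/20 bytes compact, 32 extended. */
static size_t
read_reply_header (int fd, uint8_t buf[32], int extended_mode)
{
  uint32_t magic;

  if (extended_mode) {
    read_full (fd, buf, 32);      /* fixed width: one read, no peeking */
    return 32;
  }
  read_full (fd, buf, 4);         /* must learn the flavor first */
  memcpy (&magic, buf, 4);
  if (be32toh (magic) == NBD_SIMPLE_REPLY_MAGIC) {
    read_full (fd, buf + 4, 12);  /* 16-byte simple reply */
    return 16;
  }
  read_full (fd, buf + 4, 16);    /* 20-byte structured reply */
  return 20;
}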

This spec addition was done in parallel with proof of concept
implementations in qemu (server and client), libnbd (client), and
nbdkit (server).

Signed-off-by: Eric Blake 
---
 doc/proto.md | 481 ++-
 1 file changed, 358 insertions(+), 123 deletions(-)

diff --git a/doc/proto.md b/doc/proto.md
index 53c334a..fde1e70 100644
--- a/doc/proto.md
+++ b/doc/proto.md
@@ -280,34 +280,53 @@ a soft disconnect.

 ### Transmission

-There are three message types in the transmission phase: the request,
-the simple reply, and the structured reply chunk.  The
+There are two general message types in the transmission phase: the
+request (simple or extended), and the reply (simple, structured, or
+extended).  Which message headers to use is determined during the
+handshaking phase, based on whether
+`NBD_OPT_STRUCTURED_REPLY` or `NBD_OPT_EXTENDED_HEADERS` was requested
+by the client and given a successful response by the server.  The
 transmission phase consists of a series of transactions, where the
 client submits requests and the server sends corresponding replies
 with either a single simple reply or a series of one or more
-structured reply chunks per request.  The phase continues until
-either side terminates transmission; this can be performed cleanly
-only by the client.
+structured or extended reply chunks per request.  The phase continues
+until either side terminates transmission; this can be performed
+cleanly only by the client.

 Note that without client negotiation, the server MUST use only simple
 replies, and that it is impossible to tell by reading the server
 traffic in isolation whether a data field will be present; the simple
 reply is also problematic for error handling of the `NBD_CMD_READ`
-request.  Therefore, structured replies can be used to create a
-a context-free server stream; see below.
+request.  Therefore, structured or extended replies can be used to
+create a context-free server stream; see below.
+
+The results of client negotiation also determine whether the client
+and server will utilize only compact requests and replies, or whether
+both sides will use only extended packets.  Compact messages are the
+default, but inherently limit single transactions to a 32-bit window
+starting at a 64-bit offset.  Extended messages make it possible to
+perform 64-bit transactions (although typically only for commands that
+do not include a data payload).  Furthermore, when only structured
+replies have been negotiated, compact messages require the client to
+perform partial reads to determine which reply packet style (16-byte
+simple or 20-byte structured) is on the wire before knowing the length
+of the rest of the reply, which can reduce client performance.  With
+extended messages, all packet heade

[PATCH v2 00/15] qemu patches for 64-bit NBD extensions

2022-11-14 Thread Eric Blake
This series implements the spec changes in a counterpart NBD series,
and has been tested to be interoperable with libnbd implementing the
same spec.  I'm not too happy with the RFC patch at the end, but
implemented it for discussion.  Given the release timing, this would
be qemu 8.0 material if we are happy with the direction the spec is
headed in.

Eric Blake (15):
  nbd/client: Add safety check on chunk payload length
  nbd/server: Prepare for alternate-size headers
  nbd: Prepare for 64-bit request effect lengths
  nbd: Add types for extended headers
  nbd/server: Refactor handling of request payload
  nbd/server: Refactor to pass full request around
  nbd/server: Initial support for extended headers
  nbd/server: Support 64-bit block status
  nbd/client: Initial support for extended headers
  nbd/client: Accept 64-bit block status chunks
  nbd/client: Request extended headers during negotiation
  nbd/server: Prepare for per-request filtering of BLOCK_STATUS
  nbd/server: Add FLAG_PAYLOAD support to CMD_BLOCK_STATUS
  RFC: nbd/client: Accept 64-bit hole chunks
  RFC: nbd/server: Send 64-bit hole chunk

 docs/interop/nbd.txt  |   1 +
 include/block/nbd.h   | 163 +++--
 nbd/nbd-internal.h|   8 +-
 block/nbd.c   | 102 ++-
 nbd/client-connection.c   |   1 +
 nbd/client.c  | 132 +++-
 nbd/common.c  |  14 +-
 nbd/server.c  | 636 +-
 qemu-nbd.c|   3 +
 block/trace-events|   1 +
 nbd/trace-events  |  11 +-
 tests/qemu-iotests/223.out|  18 +-
 tests/qemu-iotests/233.out|   5 +
 tests/qemu-iotests/241.out|   3 +
 tests/qemu-iotests/307.out|  15 +-
 .../tests/nbd-qemu-allocation.out |   3 +-
 16 files changed, 797 insertions(+), 319 deletions(-)

-- 
2.38.1




[libnbd PATCH v2 09/23] block_status: Accept 64-bit extents during block status

2022-11-14 Thread Eric Blake
Support a server giving us a 64-bit extent.  Note that the protocol
says a server should not give a 64-bit answer when extended headers
are not negotiated; we can handle that by reporting EPROTO but
otherwise accepting the information.  Meanwhile, when extended headers
are in effect, even a 32-bit original query can produce a 64-bit
answer; and likewise, a 64-bit query may have so much information that
the server truncates it to a 32-bit answer, so we must be prepared for
either type of response.  Since we already store 64-bit extents
internally, the user's 32-bit callback doesn't have to care which
reply chunk the server uses (the shim takes care of that, and an
upcoming patch adds new APIs to let the client use a 64-bit callback).
Of course, until a later patch enables extended headers negotiation,
no compliant server will trigger the new code here.

Implementation-wise, 'normal' and 'wide' are two different types but
have the same underlying size; keeping the two names makes it easier
to reason about when values are still in network byte order from the
server or native endian for local processing.
---
 lib/internal.h  |  3 +
 generator/states-reply-structured.c | 88 ++---
 2 files changed, 72 insertions(+), 19 deletions(-)

diff --git a/lib/internal.h b/lib/internal.h
index 0c23f882..b91fe6f6 100644
--- a/lib/internal.h
+++ b/lib/internal.h
@@ -248,6 +248,7 @@ struct nbd_handle {
 struct nbd_structured_reply_offset_data offset_data;
 struct nbd_structured_reply_offset_hole offset_hole;
 struct nbd_structured_reply_block_status_hdr bs_hdr;
+struct nbd_structured_reply_block_status_ext_hdr bs_ext_hdr;
 struct {
   struct nbd_structured_reply_error error;
   char msg[NBD_MAX_STRING]; /* Common to all error types */
@@ -306,6 +307,8 @@ struct nbd_handle {
 char *storage;  /* malloc's view */
 nbd_extent *normal; /* Our 64-bit preferred internal form; n slots */
+uint32_t *narrow;   /* 32-bit NBD_REPLY_TYPE_BLOCK_STATUS form; n*2 slots */
+struct nbd_block_descriptor_ext *wide;
+/* 64-bit NBD_REPLY_TYPE_BLOCK_STATUS_EXT; n slots */
   } bs_entries;

   /* Commands which are waiting to be issued [meaning the request
diff --git a/generator/states-reply-structured.c b/generator/states-reply-structured.c
index d23e56a9..7e313b5a 100644
--- a/generator/states-reply-structured.c
+++ b/generator/states-reply-structured.c
@@ -22,6 +22,8 @@
 #include 
 #include 

+#include "minmax.h"
+
 /* Structured reply must be completely inside the bounds of the
  * requesting command.
  */
@@ -147,6 +149,24 @@  REPLY.STRUCTURED_REPLY.CHECK:
 /* Start by reading the context ID. */
 h->rbuf = &h->sbuf.reply.payload.bs_hdr;
 h->rlen = sizeof h->sbuf.reply.payload.bs_hdr;
+h->sbuf.reply.payload.bs_ext_hdr.count = 0;
+SET_NEXT_STATE (%RECV_BS_HEADER);
+break;
+
+  case NBD_REPLY_TYPE_BLOCK_STATUS_EXT:
+if (cmd->type != NBD_CMD_BLOCK_STATUS ||
+length < 24 ||
+(length-8) % sizeof(struct nbd_block_descriptor_ext))
+  goto resync;
+if (!h->extended_headers) {
+  debug (h, "unexpected 64-bit block status without extended headers, "
+ "this is probably a server bug");
+  if (cmd->error == 0)
+cmd->error = EPROTO;
+}
+/* Start by reading the context ID. */
+h->rbuf = &h->sbuf.reply.payload.bs_ext_hdr;
+h->rlen = sizeof h->sbuf.reply.payload.bs_ext_hdr;
 SET_NEXT_STATE (%RECV_BS_HEADER);
 break;

@@ -437,6 +457,7 @@  REPLY.STRUCTURED_REPLY.RECV_BS_HEADER:
   struct command *cmd = h->reply_cmd;
   uint32_t length;
   uint32_t count;
+  uint16_t type;

   switch (recv_into_rbuf (h)) {
   case -1: SET_NEXT_STATE (%.DEAD); return 0;
@@ -446,24 +467,44 @@  REPLY.STRUCTURED_REPLY.RECV_BS_HEADER:
 return 0;
   case 0:
 length = h->sbuf.reply.hdr.structured.length; /* normalized in CHECK */
+type = be16toh (h->sbuf.reply.hdr.structured.type);

 assert (cmd); /* guaranteed by CHECK */
 assert (cmd->type == NBD_CMD_BLOCK_STATUS);
-assert (length >= 12);
-length -= sizeof h->sbuf.reply.payload.bs_hdr;
-count = length / (2 * sizeof (uint32_t));

-/* Read raw data into a subset of h->bs_entries, then expand it
+if (type == NBD_REPLY_TYPE_BLOCK_STATUS) {
+  assert (length >= 12);
+  length -= sizeof h->sbuf.reply.payload.bs_hdr;
+  count = length / (2 * sizeof (uint32_t));
+}
+else {
+  assert (type == NBD_REPLY_TYPE_BLOCK_STATUS_EXT);
+  assert (length >= 24);
+  length -= sizeof h->sbuf.reply.payload.bs_ext_hdr;
+  count = length / sizeof (struct nbd_block_descriptor_ext);
+  if (h->sbuf.reply.payload.bs_ext_hdr.count &&
+  count != be32toh (h->sbuf.reply.payload.bs_ext_hdr.count)) {
+h->rbuf = NULL;
+h->rlen = length;
+SET_NEXT_STATE (%RESYNC);
+return 0;
+  }
+}
+/* Normalize

[PATCH v2 2/6] spec: Tweak description of maximum block size

2022-11-14 Thread Eric Blake
Commit 9f30fedb improved the spec to allow non-payload requests that
exceed any advertised maximum block size.  Take this one step further
by permitting the server to use NBD_EOVERFLOW as a hint to the client
when a request is oversize (while permitting NBD_EINVAL for
back-compat), and by rewording the text to explicitly call out that
what is being advertised is the maximum payload length, not maximum
block size.  This becomes more important when we add 64-bit
extensions, where it becomes possible to extend `NBD_CMD_BLOCK_STATUS`
to have both an effect length (how much of the image does the client
want status on - may be larger than 32 bits) and an optional payload
length (a way to filter the response to a subset of negotiated
metadata contexts).  In the shorter term, it means that a server may
(but not must) accept a read request larger than the maximum block
size if it can use structured replies to keep each chunk of the
response under the maximum payload limits.
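
(Worked example: with an advertised maximum payload of 2^25 bytes, a
server could still accept a 2^26-byte NBD_CMD_READ by answering with
several structured NBD_REPLY_TYPE_OFFSET_DATA chunks, each carrying
at most 32M of data; what it may not do is send any single payload
exceeding the advertised limit.)
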
---
 doc/proto.md | 127 +--
 1 file changed, 73 insertions(+), 54 deletions(-)

diff --git a/doc/proto.md b/doc/proto.md
index 8f08583..53c334a 100644
--- a/doc/proto.md
+++ b/doc/proto.md
@@ -745,8 +745,8 @@ text unless the client insists on TLS.

 During transmission phase, several operations are constrained by the
 export size sent by the final `NBD_OPT_EXPORT_NAME` or `NBD_OPT_GO`,
-as well as by three block size constraints defined here (minimum,
-preferred, and maximum).
+as well as by three block size constraints defined here (minimum
+block, preferred block, and maximum payload).

 If a client can honour server block size constraints (as set out below
 and under `NBD_INFO_BLOCK_SIZE`), it SHOULD announce this during the
@@ -772,15 +772,15 @@ learn the server's constraints without committing to them.

 If block size constraints have not been advertised or agreed on
 externally, then a server SHOULD support a default minimum block size
-of 1, a preferred block size of 2^12 (4,096), and a maximum block size
-that is effectively unlimited (0xffffffff, or the export size if that
-is smaller), while a client desiring maximum interoperability SHOULD
-constrain its requests to a minimum block size of 2^9 (512), and limit
-`NBD_CMD_READ` and `NBD_CMD_WRITE` commands to a maximum block size of
-2^25 (33,554,432).  A server that wants to enforce block sizes other
-than the defaults specified here MAY refuse to go into transmission
-phase with a client that uses `NBD_OPT_EXPORT_NAME` (via a hard
-disconnect) or which uses `NBD_OPT_GO` without requesting
+of 1, a preferred block size of 2^12 (4,096), and a maximum block
+payload size that is at least 2^25 (33,554,432) (even if the export
+size is smaller); while a client desiring maximum interoperability
+SHOULD constrain its requests to a minimum block size of 2^9 (512),
+and limit `NBD_CMD_READ` and `NBD_CMD_WRITE` commands to a maximum
+block size of 2^25 (33,554,432).  A server that wants to enforce block
+sizes other than the defaults specified here MAY refuse to go into
+transmission phase with a client that uses `NBD_OPT_EXPORT_NAME` (via
+a hard disconnect) or which uses `NBD_OPT_GO` without requesting
 `NBD_INFO_BLOCK_SIZE` (via an error reply of
 `NBD_REP_ERR_BLOCK_SIZE_REQD`); but servers SHOULD NOT refuse clients
 that do not request sizing information when the server supports
@@ -818,17 +818,40 @@ the preferred block size for that export.  The server MAY advertise an
 export size that is not an integer multiple of the preferred block
 size.

-The maximum block size represents the maximum length that the server
-is willing to handle in one request.  If advertised, it MAY be
-something other than a power of 2, but MUST be either an integer
-multiple of the minimum block size or the value 0xffffffff for no
-inherent limit, MUST be at least as large as the smaller of the
+The maximum block payload size represents the maximum payload length
+that the server is willing to handle in one request.  If advertised,
+it MAY be something other than a power of 2, but MUST be either an
+integer multiple of the minimum block size or the value 0xffffffff for
+no inherent limit, MUST be at least as large as the smaller of the
 preferred block size or export size, and SHOULD be at least 2^20
 (1,048,576) if the export is that large.  For convenience, the server
-MAY advertise a maximum block size that is larger than the export
-size, although in that case, the client MUST treat the export size as
-the effective maximum block size (as further constrained by a nonzero
-offset).
+MAY advertise a maximum block payload size that is larger than the
+export size, although in that case, the client MUST treat the export
+size as an effective maximum block size (as further constrained by a
+nonzero offset).  Notwithstanding any maximum block size advertised,
+either the server or the client MAY initiate a hard disconnect if a
+payload length of either a request or a r

[PATCH v2 11/15] nbd/client: Request extended headers during negotiation

2022-11-14 Thread Eric Blake
All the pieces are in place for a client to finally request extended
headers.  Note that we must not request extended headers when qemu-nbd
is used to connect to the kernel module (as nbd.ko does not expect
them), but there is no harm in all other clients requesting them.

Extended headers are not essential to the information collected during
'qemu-nbd --list', but probing for it gives us one more piece of
information in that output.  Update the iotests affected by the new
line of output.

Signed-off-by: Eric Blake 
---
 nbd/client-connection.c   |  1 +
 nbd/client.c  | 35 +--
 qemu-nbd.c|  2 ++
 tests/qemu-iotests/223.out|  6 
 tests/qemu-iotests/233.out|  5 +++
 tests/qemu-iotests/241.out|  3 ++
 tests/qemu-iotests/307.out|  5 +++
 .../tests/nbd-qemu-allocation.out |  1 +
 8 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/nbd/client-connection.c b/nbd/client-connection.c
index 0c5f917efa..3576190d09 100644
--- a/nbd/client-connection.c
+++ b/nbd/client-connection.c
@@ -93,6 +93,7 @@ NBDClientConnection *nbd_client_connection_new(const SocketAddress *saddr,

 .initial_info.request_sizes = true,
 .initial_info.structured_reply = true,
+.initial_info.extended_headers = true,
 .initial_info.base_allocation = true,
 .initial_info.x_dirty_bitmap = g_strdup(x_dirty_bitmap),
 .initial_info.name = g_strdup(export_name ?: "")
diff --git a/nbd/client.c b/nbd/client.c
index 70f06ce637..413304f553 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -878,12 +878,13 @@ static int nbd_list_meta_contexts(QIOChannel *ioc,
  *  1: server is newstyle, but can only accept EXPORT_NAME
  *  2: server is newstyle, but lacks structured replies
  *  3: server is newstyle and set up for structured replies
+ *  4: server is newstyle and set up for extended headers
  */
 static int nbd_start_negotiate(AioContext *aio_context, QIOChannel *ioc,
QCryptoTLSCreds *tlscreds,
const char *hostname, QIOChannel **outioc,
-   bool structured_reply, bool *zeroes,
-   Error **errp)
+   bool structured_reply, bool ext_hdrs,
+   bool *zeroes, Error **errp)
 {
 ERRP_GUARD();
 uint64_t magic;
@@ -960,15 +961,23 @@ static int nbd_start_negotiate(AioContext *aio_context, QIOChannel *ioc,
 if (fixedNewStyle) {
 int result = 0;

-if (structured_reply) {
+if (ext_hdrs) {
+result = nbd_request_simple_option(ioc,
+   NBD_OPT_EXTENDED_HEADERS,
+   false, errp);
+if (result) {
+return result < 0 ? -EINVAL : 4;
+}
+}
+if (structured_reply && !result) {
 result = nbd_request_simple_option(ioc,
NBD_OPT_STRUCTURED_REPLY,
false, errp);
-if (result < 0) {
-return -EINVAL;
+if (result) {
+return result < 0 ? -EINVAL : 3;
 }
 }
-return 2 + result;
+return 2;
 } else {
 return 1;
 }
@@ -1030,7 +1039,8 @@ int nbd_receive_negotiate(AioContext *aio_context, QIOChannel *ioc,
 trace_nbd_receive_negotiate_name(info->name);

 result = nbd_start_negotiate(aio_context, ioc, tlscreds, hostname, outioc,
- info->structured_reply, &zeroes, errp);
+ info->structured_reply,
+ info->extended_headers, &zeroes, errp);

 info->structured_reply = false;
 info->extended_headers = false;
@@ -1040,6 +1050,9 @@ int nbd_receive_negotiate(AioContext *aio_context, QIOChannel *ioc,
 }

 switch (result) {
+case 4: /* newstyle, with extended headers */
+info->extended_headers = true;
+/* fall through */
 case 3: /* newstyle, with structured replies */
 info->structured_reply = true;
 if (base_allocation) {
@@ -1151,7 +1164,7 @@ int nbd_receive_export_list(QIOChannel *ioc, QCryptoTLSCreds *tlscreds,

 *info = NULL;
 result = nbd_start_negotiate(NULL, ioc, tlscreds, hostname, &sioc, true,
- NULL, errp);
+ true, NULL, errp);
 if (tlscreds && sioc) {
 ioc = sioc;
 }
@@ -1159,6 +1172,7 @@ int nbd_receive_export_list(QIOChannel *ioc, QCryptoTLSCreds *tlscreds,
 switch (result) {
 case 

[libnbd PATCH v2 19/23] api: Add nbd_[aio_]opt_extended_headers()

2022-11-14 Thread Eric Blake
Very similar to the recent addition of nbd_opt_structured_reply,
giving us fine-grained control over an extended headers request.

Because nbdkit does not yet support extended headers, testsuite
coverage is limited to interop testing with qemu-nbd.  It shows that
extended headers imply structured replies, regardless of which order
the two are manually negotiated in.
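
A hedged usage sketch (the URI and error handling are abbreviated;
nbd_opt_extended_headers() and nbd_set_request_extended_headers() are
the calls this series adds, the rest is existing libnbd API):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <libnbd.h>

int
main (void)
{
  struct nbd_handle *nbd = nbd_create ();
  if (nbd == NULL) exit (EXIT_FAILURE);

  nbd_set_opt_mode (nbd, true);                   /* pause negotiation */
  nbd_set_request_extended_headers (nbd, false);  /* don't auto-request */
  if (nbd_connect_uri (nbd, "nbd://localhost") == -1)
    exit (EXIT_FAILURE);

  /* Fine-grained control: request extended headers by hand, then
   * finish negotiation. */
  if (nbd_opt_extended_headers (nbd) == 1)
    printf ("server speaks extended headers\n");
  nbd_opt_go (nbd);

  nbd_close (nbd);
  return 0;
}
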
---
 generator/API.ml  |  79 +++--
 .../states-newstyle-opt-extended-headers.c|  30 +++-
 generator/states-newstyle.c   |   3 +
 lib/opt.c |  44 +
 interop/Makefile.am   |   6 +
 interop/opt-extended-headers.c| 153 ++
 interop/opt-extended-headers.sh   |  29 
 .gitignore|   1 +
 8 files changed, 329 insertions(+), 16 deletions(-)
 create mode 100644 interop/opt-extended-headers.c
 create mode 100755 interop/opt-extended-headers.sh

diff --git a/generator/API.ml b/generator/API.ml
index 570f15f4..c4d15e3a 100644
--- a/generator/API.ml
+++ b/generator/API.ml
@@ -825,6 +825,7 @@   "set_request_extended_headers", {
if L<nbd_set_request_structured_replies(3)> is also set to false,
 since the use of extended headers implies structured replies.";
 see_also = [Link "get_request_extended_headers";
+Link "opt_extended_headers";
 Link "set_handshake_flags"; Link "set_strict_mode";
 Link "get_extended_headers_negotiated";
 Link "zero"; Link "trim"; Link "cache";
@@ -856,7 +857,9 @@   "get_extended_headers_negotiated", {
 shortdesc = "see if extended headers are in use";
 longdesc = "\
 After connecting you may call this to find out if the connection is
-using extended headers.
+using extended headers.  Note that this setting is sticky; this
+can return true even after a second L<nbd_opt_extended_headers(3)>
+returns false because the server detected a duplicate request.

 When extended headers are not in use, commands are limited to a
 32-bit length, even when the libnbd API uses a 64-bit parameter
@@ -938,7 +941,7 @@   "get_structured_replies_negotiated", {
 attempted.";
 see_also = [Link "set_request_structured_replies";
 Link "get_request_structured_replies";
-Link "opt_structured_reply";
+Link "opt_structured_reply"; Link "opt_extended_headers";
 Link "get_protocol";
 Link "get_extended_headers_negotiated"];
   };
@@ -1211,12 +1214,13 @@   "set_opt_mode", {
 newstyle server.  This setting has no effect when connecting to an
 oldstyle server.

-Note that libnbd defaults to attempting C<NBD_OPT_STARTTLS> and
-C<NBD_OPT_STRUCTURED_REPLY> before letting you control remaining
-negotiation steps; if you need control over these steps as well,
-first set L<nbd_set_opt_mode(3)> to C<true> and
-L<nbd_set_request_structured_replies(3)> to false before starting
-the connection attempt.
+Note that libnbd defaults to attempting C<NBD_OPT_STARTTLS>,
+C<NBD_OPT_EXTENDED_HEADERS>, and C<NBD_OPT_STRUCTURED_REPLY>
+before letting you control remaining negotiation steps; if you
+need control over these steps as well, first set L<nbd_set_opt_mode(3)>
+to C<true>, and L<nbd_set_request_extended_headers(3)>
+or L<nbd_set_request_structured_replies(3)> to false, before
+starting the connection attempt.

 When option mode is enabled, you have fine-grained control over which
 options are negotiated, compared to the default of the server
@@ -1324,6 +1328,35 @@   "opt_starttls", {
 Link "supports_tls"]
   };

+  "opt_extended_headers", {
+default_call with
+args = []; ret = RBool;
+permitted_states = [ Negotiating ];
+shortdesc = "request the server to enable extended headers";
+longdesc = "\
+Request that the server use extended headers, by sending
+C<NBD_OPT_EXTENDED_HEADERS>.  This can only be used if
+L<nbd_set_opt_mode(3)> enabled option mode; furthermore, libnbd
+defaults to automatically requesting this unless you use
+L<nbd_set_request_extended_headers(3)> or
+L<nbd_set_opt_mode(3)> prior to connecting.
+This function is mainly useful for integration testing of corner
+cases in server handling.
+
+This function returns true if the server replies with success,
+false if the server replies with an error, and fails only if
+the server does not reply (such as for a loss of connection).
+Note that some servers fail a second request as redundant;
+libnbd assumes that once one request has succeeded, then
+extended headers are supported (as visible by
+L<nbd_get_extended_headers_negotiated(3)>) regardless if
+later calls to this function return false.  If this function
+returns true, the use of structured replies is implied.";
+see_also = [Link "set_opt_mode"; Link "aio_opt_extended_headers";
+Link "opt_go"; Link "set_request_extended_headers";
+Link "set_request_structured_replies"]
+  };
+
   "opt_structured_reply", {
 default_call with
 args = []; ret = RBool;
@@ -1345,7 +1378,9 @@   "opt_structured_reply", {
 libnbd assumes that once one request has succeeded, then
 structured replies are supported (as visible by
L<nbd_get_structured_replies_negotiated(3)>) regardless if
-later calls to this function return false.";
+later calls to this function return false.  Similarly, a
+server may fail this request if extended headers are already
+negotiated, since extended headers 

Re: [PATCH v9 1/8] mm: Introduce memfd_restricted system call to create restricted user memory

2022-11-14 Thread Michael Roth
On Mon, Nov 14, 2022 at 03:02:37PM +0100, Vlastimil Babka wrote:
> On 11/1/22 16:19, Michael Roth wrote:
> > On Tue, Nov 01, 2022 at 07:37:29PM +0800, Chao Peng wrote:
> >> > 
> >> >   1) restoring kernel directmap:
> >> > 
> >> >  Currently SNP (and I believe TDX) need to either split or remove 
> >> > kernel
> >> >  direct mappings for restricted PFNs, since there is no guarantee 
> >> > that
> >> >  other PFNs within a 2MB range won't be used for non-restricted
> >> >  (which will cause an RMP #PF in the case of SNP since the 2MB
> >> >  mapping overlaps with guest-owned pages)
> >> 
> >> Has the splitting and restoring been a well-discussed direction? I'm
> >> just curious whether there is other options to solve this issue.
> > 
> > For SNP it's been discussed for quite some time, and either splitting or
> > removing private entries from directmap are the well-discussed way I'm
> > aware of to avoid RMP violations due to some other kernel process using
> > a 2MB mapping to access shared memory if there are private pages that
> > happen to be within that range.
> > 
> > In both cases the issue of how to restore directmap as 2M becomes a
> > problem.
> > 
> > I was also under the impression TDX had similar requirements. If so,
> > do you know what the plan is for handling this for TDX?
> > 
> > There are also 2 potential alternatives I'm aware of, but these haven't
> > been discussed in much detail AFAIK:
> > 
> > a) Ensure confidential guests are backed by 2MB pages. shmem has a way to
> >request 2MB THP pages, but I'm not sure how reliably we can guarantee
> >that enough THPs are available, so if we went that route we'd probably
> >be better off requiring the use of hugetlbfs as the backing store. But
> >obviously that's a bit limiting and it would be nice to have the option
> >of using normal pages as well. One nice thing with invalidation
> >scheme proposed here is that this would "Just Work" if implement
> >hugetlbfs support, so an admin that doesn't want any directmap
> >splitting has this option available, otherwise it's done as a
> >best-effort.
> > 
> > b) Implement general support for restoring directmap as 2M even when
> >subpages might be in use by other kernel threads. This would be the
> >most flexible approach since it requires no special handling during
> >invalidations, but I think it's only possible if all the CPA
> >attributes for the 2M range are the same at the time the mapping is
> >restored/unsplit, so some potential locking issues there and still
> >chance for splitting directmap over time.
> 
> I've been hoping that
> 
> c) using a mechanism such as [1] [2] where the goal is to group together
> these small allocations that need to increase directmap granularity so
> maximum number of large mappings are preserved.

Thanks for the references. I wasn't aware there was work in this area;
this opens up some possibilities on how to approach this.

> maximum number of large mappings are preserved. But I guess that means
> knowing at allocation time that this will happen. So I've been wondering how
> this would be possible to employ in the SNP/UPM case? I guess it depends on
> how we expect the private/shared conversions to happen in practice, and I
> don't know the details. I can imagine the following complications:
> 
> - a memfd_restricted region is created such that it's 2MB large/aligned,
> i.e. like case a) above, we can allocate it normally. Now, what if a 4k page
> in the middle is to be temporarily converted to shared for some
> communication between host and guest (can such thing happen?). With the
> punch hole approach, I wonder if we end up fragmenting directmap
> unnecessarily? IIUC the now shared page will become backed by some other

Yes, we end up fragmenting in cases where a guest converts a sub-page to a
shared page because the fallocate(PUNCH_HOLE) gets forwarded through to shmem
which will then split it. At that point the subpage might get used elsewhere
so we no longer have the ability to restore as 2M after
invalidation/shutdown. We could potentially just intercept those
fallocate()'s and only issue the invalidation once all the subpages have
been PUNCH_HOLE'd. We'd still need to ensure KVM MMU invalidations
happen immediately though, but since we rely on a KVM ioctl to do the
conversion in advance, we can rely on the KVM MMU invalidation that
happens at that point and simply make fallocate(PUNCH_HOLE) fail if
someone attempts it on a page that hasn't been converted to shared yet.

Otherwise we could end up splitting a good chunk of pages depending on how
guest allocates shared pages, but I'm slightly less concerned about that
seeing as there are some general solutions to directmap fragmentation
being considered. I need to think more how this hooks would tie in to
that though.

And since we'd only really be able to avoid unrecoverable splits if
the restrictedmem is

Re: [PATCH 02/13] test-bdrv-drain: Don't yield in .bdrv_co_drained_begin/end()

2022-11-14 Thread Hanna Reitz

On 08.11.22 13:37, Kevin Wolf wrote:

We want to change .bdrv_co_drained_begin/end() back to be non-coroutine
callbacks, so in preparation, avoid yielding in their implementation.

This does almost the same as the existing logic in bdrv_drain_invoke(),
by creating and entering coroutines internally. However, since the test
case is by far the heaviest user of coroutine code in drain callbacks,
it is preferable to have the complexity in the test case rather than the
drain core, which is already complicated enough without this.

The behaviour for bdrv_drain_begin() is unchanged because we increase
bs->in_flight and this is still polled. However, bdrv_drain_end()
doesn't wait for the spawned coroutine to complete any more. This is
fine, we don't rely on bdrv_drain_end() restarting all operations
immediately before the next aio_poll().

Signed-off-by: Kevin Wolf 
---
  tests/unit/test-bdrv-drain.c | 64 ++--
  1 file changed, 46 insertions(+), 18 deletions(-)


Reviewed-by: Hanna Reitz 




[libnbd PATCH v2 12/23] copy: Update nbdcopy to use 64-bit block status

2022-11-14 Thread Eric Blake
Although our use of "base:allocation" doesn't require the use of the
64-bit API for flags, we might perform slightly faster for a server
that does give us 64-bit extent lengths and honors larger nbd_zero
lengths.
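
The mechanical change is easiest to see side by side: the old
callback delivered extents as flat (length, flags) uint32_t pairs,
the new one delivers an array of 64-bit nbd_extent structs.  A
minimal sketch (summing data lengths; illustrative only, with
nbd_extent written out as introduced by this series):

#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t length; uint64_t flags; } nbd_extent;

static uint64_t
total_length_32 (const uint32_t *entries, size_t nr_entries)
{
  uint64_t sum = 0;
  for (size_t i = 0; i < nr_entries; i += 2)   /* even=length, odd=flags */
    sum += entries[i];
  return sum;
}

static uint64_t
total_length_64 (const nbd_extent *entries, size_t nr_entries)
{
  uint64_t sum = 0;
  for (size_t i = 0; i < nr_entries; i++)
    sum += entries[i].length;
  return sum;
}
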
---
 copy/nbd-ops.c | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/copy/nbd-ops.c b/copy/nbd-ops.c
index 34ab4857..dad78ea9 100644
--- a/copy/nbd-ops.c
+++ b/copy/nbd-ops.c
@@ -428,7 +428,7 @@ nbd_ops_asynch_notify_write (struct rw *rw, size_t index)
  * request for extents in a single round trip.
  */
 static int add_extent (void *vp, const char *metacontext,
-   uint64_t offset, uint32_t *entries, size_t nr_entries,
+   uint64_t offset, nbd_extent *entries, size_t nr_entries,
int *error);

 static void
@@ -449,11 +449,11 @@ nbd_ops_get_extents (struct rw *rw, size_t index,
 size_t i;

 exts.len = 0;
-if (nbd_block_status (nbd, count, offset,
-  (nbd_extent_callback) {
-.user_data = &exts,
-.callback = add_extent
-  }, 0) == -1) {
+if (nbd_block_status_64 (nbd, count, offset,
+ (nbd_extent64_callback) {
+   .user_data = &exts,
+   .callback = add_extent
+ }, 0) == -1) {
   /* XXX We could call default_get_extents, but unclear if it's
* the right thing to do if the server is returning errors.
*/
@@ -493,7 +493,7 @@ nbd_ops_get_extents (struct rw *rw, size_t index,

 static int
 add_extent (void *vp, const char *metacontext,
-uint64_t offset, uint32_t *entries, size_t nr_entries,
+uint64_t offset, nbd_extent *entries, size_t nr_entries,
 int *error)
 {
   extent_list *ret = vp;
@@ -502,25 +502,25 @@ add_extent (void *vp, const char *metacontext,
   if (strcmp (metacontext, "base:allocation") != 0 || *error)
 return 0;

-  for (i = 0; i < nr_entries; i += 2) {
+  for (i = 0; i < nr_entries; i++) {
 struct extent e;

 e.offset = offset;
-e.length = entries[i];
+e.length = entries[i].length;

 /* Note we deliberately don't care about the HOLE flag.  There is
 * no need to read an extent that reads as zeroes.  We will convert
 * it to a hole or allocated extents based on the command line
  * arguments.
  */
-e.zero = (entries[i+1] & LIBNBD_STATE_ZERO) != 0;
+e.zero = (entries[i].flags & LIBNBD_STATE_ZERO) != 0;

 if (extent_list_append (ret, e) == -1) {
   perror ("realloc");
   exit (EXIT_FAILURE);
 }

-offset += entries[i];
+offset += entries[i].length;
   }

   return 0;
-- 
2.38.1




[PATCH v2 14/15] RFC: nbd/client: Accept 64-bit hole chunks

2022-11-14 Thread Eric Blake
As part of adding extended headers, the NBD spec debated about adding
support for reading 64-bit holes.  It was documented in a separate
upstream commit XXX[*] to make it easier to decide whether 64-bit
holes should be required of all clients supporting extended headers,
or whether it is an unneeded feature; hence, the qemu work to support
it is also pulled out into a separate commit.

Note that we can also tolerate a non-compliant server sending the new
chunk even when it should not.

Signed-off-by: Eric Blake 

---
[*] Fix commit id if we actually go with this idea
---
 include/block/nbd.h |  8 
 block/nbd.c | 26 --
 nbd/common.c|  4 +++-
 3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index 2a65c606c9..18b6bad038 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -133,6 +133,13 @@ typedef struct NBDStructuredReadHole {
 uint32_t length;
 } QEMU_PACKED NBDStructuredReadHole;

+/* Complete chunk for NBD_REPLY_TYPE_OFFSET_HOLE_EXT */
+typedef struct NBDStructuredReadHoleExt {
+/* header's length == 16 */
+uint64_t offset;
+uint64_t length;
+} QEMU_PACKED NBDStructuredReadHoleExt;
+
 /* Header of all NBD_REPLY_TYPE_ERROR* errors */
 typedef struct NBDStructuredError {
 /* header's length >= 6 */
@@ -309,6 +316,7 @@ enum {
 #define NBD_REPLY_TYPE_NONE  0
 #define NBD_REPLY_TYPE_OFFSET_DATA   1
 #define NBD_REPLY_TYPE_OFFSET_HOLE   2
+#define NBD_REPLY_TYPE_OFFSET_HOLE_EXT   3
 #define NBD_REPLY_TYPE_BLOCK_STATUS  5
 #define NBD_REPLY_TYPE_BLOCK_STATUS_EXT  6
 #define NBD_REPLY_TYPE_ERROR NBD_REPLY_ERR(1)
diff --git a/block/nbd.c b/block/nbd.c
index 44ab5437ea..968d5d8a37 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -570,20 +570,26 @@ static inline uint64_t payload_advance64(uint8_t **payload)

 static int nbd_parse_offset_hole_payload(BDRVNBDState *s,
  NBDStructuredReplyChunk *chunk,
- uint8_t *payload, uint64_t orig_offset,
+ uint8_t *payload, bool wide,
+ uint64_t orig_offset,
  QEMUIOVector *qiov, Error **errp)
 {
 uint64_t offset;
-uint32_t hole_size;
+uint64_t hole_size;
+size_t len = wide ? sizeof(hole_size) : sizeof(uint32_t);

-if (chunk->length != sizeof(offset) + sizeof(hole_size)) {
+if (chunk->length != sizeof(offset) + len) {
 error_setg(errp, "Protocol error: invalid payload for "
  "NBD_REPLY_TYPE_OFFSET_HOLE");
 return -EINVAL;
 }

 offset = payload_advance64(&payload);
-hole_size = payload_advance32(&payload);
+if (wide) {
+hole_size = payload_advance64(&payload);
+} else {
+hole_size = payload_advance32(&payload);
+}

 if (!hole_size || offset < orig_offset || hole_size > qiov->size ||
 offset > orig_offset + qiov->size - hole_size) {
@@ -596,6 +602,7 @@ static int nbd_parse_offset_hole_payload(BDRVNBDState *s,
 trace_nbd_structured_read_compliance("hole");
 }

+assert(hole_size <= SIZE_MAX);
 qemu_iovec_memset(qiov, offset - orig_offset, 0, hole_size);

 return 0;
@@ -1094,9 +1101,16 @@ static int coroutine_fn nbd_co_receive_cmdread_reply(BDRVNBDState *s, uint64_t h
  * in qiov
  */
 break;
+case NBD_REPLY_TYPE_OFFSET_HOLE_EXT:
+if (!s->info.extended_headers) {
+trace_nbd_extended_headers_compliance("hole_ext");
+}
+/* fallthrough */
 case NBD_REPLY_TYPE_OFFSET_HOLE:
-ret = nbd_parse_offset_hole_payload(s, &reply.structured, payload,
-offset, qiov, &local_err);
+ret = nbd_parse_offset_hole_payload(
+s, &reply.structured, payload,
+chunk->type == NBD_REPLY_TYPE_OFFSET_HOLE_EXT,
+offset, qiov, &local_err);
 if (ret < 0) {
 nbd_channel_error(s, ret);
 nbd_iter_channel_error(&iter, ret, &local_err);
diff --git a/nbd/common.c b/nbd/common.c
index 137466defd..54f7d6a4fd 100644
--- a/nbd/common.c
+++ b/nbd/common.c
@@ -174,7 +174,9 @@ const char *nbd_reply_type_lookup(uint16_t type)
 case NBD_REPLY_TYPE_OFFSET_DATA:
 return "data";
 case NBD_REPLY_TYPE_OFFSET_HOLE:
-return "hole";
+return "hole (32-bit)";
+case NBD_REPLY_TYPE_OFFSET_HOLE_EXT:
+return "hole (64-bit)";
 case NBD_REPLY_TYPE_BLOCK_STATUS:
 return "block status (32-bit)";
 case NBD_REPLY_TYPE_BLOCK_STATUS_EXT:
-- 
2.38.1




[PATCH v2 06/15] nbd/server: Refactor to pass full request around

2022-11-14 Thread Eric Blake
Part of NBD's 64-bit headers extension involves passing the client's
requested offset back as part of the reply header (one reason for this
change: converting absolute offsets stored in
NBD_REPLY_TYPE_OFFSET_DATA to relative offsets within the buffer is
easier if the absolute offset of the buffer is also available).  This
is a refactoring patch to pass the full request around the reply
stack, rather than just the handle, so that later patches can then
access request->from when extended headers are active.  But for this
patch, there are no semantic changes.

Signed-off-by: Eric Blake 
---
 nbd/server.c | 109 ++-
 1 file changed, 55 insertions(+), 54 deletions(-)

diff --git a/nbd/server.c b/nbd/server.c
index ad5c2052b5..4d1400430b 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1890,18 +1890,17 @@ static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
 }

 static inline void set_be_simple_reply(NBDClient *client, struct iovec *iov,
-   uint64_t error, uint64_t handle)
+   uint64_t error, NBDRequest *request)
 {
 NBDSimpleReply *reply = iov->iov_base;

 iov->iov_len = sizeof(*reply);
 stl_be_p(&reply->magic, NBD_SIMPLE_REPLY_MAGIC);
 stl_be_p(&reply->error, error);
-stq_be_p(&reply->handle, handle);
+stq_be_p(&reply->handle, request->handle);
 }

-static int nbd_co_send_simple_reply(NBDClient *client,
-uint64_t handle,
+static int nbd_co_send_simple_reply(NBDClient *client, NBDRequest *request,
 uint32_t error,
 void *data,
 size_t len,
@@ -1914,16 +1913,16 @@ static int nbd_co_send_simple_reply(NBDClient *client,
 {.iov_base = data, .iov_len = len}
 };

-trace_nbd_co_send_simple_reply(handle, nbd_err, nbd_err_lookup(nbd_err),
-   len);
-set_be_simple_reply(client, &iov[0], nbd_err, handle);
+trace_nbd_co_send_simple_reply(request->handle, nbd_err,
+   nbd_err_lookup(nbd_err), len);
+set_be_simple_reply(client, &iov[0], nbd_err, request);

 return nbd_co_send_iov(client, iov, len ? 2 : 1, errp);
 }

 static inline void set_be_chunk(NBDClient *client, struct iovec *iov,
 uint16_t flags, uint16_t type,
-uint64_t handle, uint32_t length)
+NBDRequest *request, uint32_t length)
 {
 NBDStructuredReplyChunk *chunk = iov->iov_base;

@@ -1931,12 +1930,12 @@ static inline void set_be_chunk(NBDClient *client, struct iovec *iov,
 stl_be_p(&chunk->magic, NBD_STRUCTURED_REPLY_MAGIC);
 stw_be_p(&chunk->flags, flags);
 stw_be_p(&chunk->type, type);
-stq_be_p(&chunk->handle, handle);
+stq_be_p(&chunk->handle, request->handle);
 stl_be_p(&chunk->length, length);
 }

 static int coroutine_fn nbd_co_send_structured_done(NBDClient *client,
-uint64_t handle,
+NBDRequest *request,
 Error **errp)
 {
 NBDReply hdr;
@@ -1944,15 +1943,15 @@ static int coroutine_fn nbd_co_send_structured_done(NBDClient *client,
 {.iov_base = &hdr},
 };

-trace_nbd_co_send_structured_done(handle);
+trace_nbd_co_send_structured_done(request->handle);
 set_be_chunk(client, &iov[0], NBD_REPLY_FLAG_DONE,
- NBD_REPLY_TYPE_NONE, handle, 0);
+ NBD_REPLY_TYPE_NONE, request, 0);

 return nbd_co_send_iov(client, iov, 1, errp);
 }

 static int coroutine_fn nbd_co_send_structured_read(NBDClient *client,
-uint64_t handle,
+NBDRequest *request,
 uint64_t offset,
 void *data,
 size_t size,
@@ -1968,16 +1967,16 @@ static int coroutine_fn nbd_co_send_structured_read(NBDClient *client,
 };

 assert(size);
-trace_nbd_co_send_structured_read(handle, offset, data, size);
+trace_nbd_co_send_structured_read(request->handle, offset, data, size);
 set_be_chunk(client, &iov[0], final ? NBD_REPLY_FLAG_DONE : 0,
- NBD_REPLY_TYPE_OFFSET_DATA, handle, iov[1].iov_len + size);
+ NBD_REPLY_TYPE_OFFSET_DATA, request, iov[1].iov_len + size);
 stq_be_p(&chunk.offset, offset);

 return nbd_co_send_iov(client, iov, 3, errp);
 }

 static int coroutine_fn nbd_co_send_structured_error(NBDClient *client,
- uint64_t handle,
+ NBDRequest 

Re: [PATCH 3/4] hw/cxl: Multi-Region CXL Type-3 Devices (Volatile and Persistent)

2022-11-14 Thread Gregory Price
On Mon, Nov 14, 2022 at 05:53:41PM +, Jonathan Cameron wrote:
> Hi Gregory,
> 
> I've not been rushing on this purely because we are after the feature
> freeze for this QEMU cycle so no great rush to line up new features
> (and there was some fun with the pull request the previous set of QEMU
> CXL features were in - unrelated to those patches).
> 
> A few comments inline.
> 
> Once I've chased down a segfault on power down of my qemu setup (that
> seems to have nothing to do with the CXL support. *sigh*) I'll push out
> an updated tree with this on it for testing purposes.
> 
> Thanks,
> 
> Jonathan
> 

All good, I've been wrapped up in other work.  Just ping me when you are
pushing a new branch and I'll gladly rebase and address the notes.

Regards
Gregory



[libnbd PATCH v2 22/23] api: Add nbd_[aio_]block_status_filter()

2022-11-14 Thread Eric Blake
As part of extending NBD to support 64-bit lengths, the protocol also
added an option for servers to allow clients to request filtered
responses to NBD_CMD_BLOCK_STATUS when more than one meta-context is
negotiated (see NBD commit XXX[*]).  At the same time as this patch,
qemu-nbd was taught to support and advertise this feature as a server,
but does not utilize it as a client (qemu doesn't yet need to connect
to multiple contexts at once).  Thus, adding generic client support
and enhancing the interop/ test in libnbd are needed to prove that the
feature is viable and worth standardizing.
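
For illustration, a filtered request from client code might look like
this (hypothetical fragment: the exact C prototype is generated from
the API.ml entry below, and "nbd", "cb" and "data" are placeholders):

    char *contexts[] = { "base:allocation", NULL };
    if (nbd_block_status_filter (nbd, length, offset, contexts,
                                 (nbd_extent64_callback) { .callback = cb,
                                                           .user_data = &data },
                                 0) == -1) {
      fprintf (stderr, "%s\n", nbd_get_error ());
      exit (EXIT_FAILURE);
    }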

---
[*] FIXME with actual commit id
---
 lib/internal.h   |   5 +-
 generator/API.ml |  71 +++--
 generator/states-issue-command.c |   4 +-
 lib/aio.c|   7 +-
 lib/rw.c | 127 ++-
 interop/block-status-payload.c   | 117 +++-
 interop/block-status-payload.sh  |  14 +++-
 info/info-can.sh |   3 +
 8 files changed, 336 insertions(+), 12 deletions(-)

diff --git a/lib/internal.h b/lib/internal.h
index 1abb21cb..ac8d99c4 100644
--- a/lib/internal.h
+++ b/lib/internal.h
@@ -73,6 +73,8 @@ struct meta_context {
 };
 DEFINE_VECTOR_TYPE(meta_vector, struct meta_context);

+DEFINE_VECTOR_TYPE(uint32_vector, uint32_t);
+
 struct export {
   char *name;
   char *description;
@@ -379,7 +381,8 @@ struct command {
   uint64_t cookie;
   uint64_t offset;
   uint64_t count;
-  void *data; /* Buffer for read/write */
+  void *data; /* Buffer for read/write, uint32_vector* for status payload */
+  uint32_vector *ids; /* For block status with payload */
   struct command_cb cb;
   bool initialized; /* For read, true if getting a hole may skip memset */
   uint32_t data_seen; /* For read, cumulative size of data chunks seen */
diff --git a/generator/API.ml b/generator/API.ml
index bbf7c0bb..6bf67de0 100644
--- a/generator/API.ml
+++ b/generator/API.ml
@@ -2287,12 +2287,13 @@   "can_block_status_payload", {
 longdesc = "\
 Returns true if the server supports the use of the
 C flag to allow filtering of the
-block status command.  Returns
+block status command (see L).  Returns
 false if the server does not.  Note that this will never return
 true if L is false."
 ^ non_blocking_test_call_description;
 see_also = [SectionLink "Flag calls"; Link "opt_info";
-Link "get_extended_headers_negotiated"];
+Link "get_extended_headers_negotiated";
+Link "block_status_filter"];
 example = Some "examples/server-flags.c";
   };

@@ -2361,6 +2362,10 @@   "can_meta_context", {
 meta contexts were requested but there is a missing or failed
 attempt at NBD_OPT_SET_META_CONTEXT during option negotiation.

+If the server supports block status filtering (see
+L, this function must return
+true for any filter name passed to L.
+
 The single parameter is the name of the metadata context,
 for example C.
 Blibnbd.hE> includes defined constants for well-known
@@ -2893,9 +2898,12 @@   "block_status_64", {
 information about blocks beginning from the specified
 offset to be returned. The C parameter is a hint: the
 server may choose to return less status, or the final block
-may extend beyond the requested range. If multiple contexts
+may extend beyond the requested range. When multiple contexts
 are supported, the number of blocks and cumulative length
-of those blocks need not be identical between contexts.
+of those blocks need not be identical between contexts; this
+command generally returns the status of all negotiated contexts,
+while some servers also support a filtered request (see
+L, L).

 Note that not all servers can support a C of 4GiB or larger;
 L indicates which servers
@@ -2945,11 +2953,38 @@   "block_status_64", {
 does not exceed C bytes; however, libnbd does not
 validate that the server obeyed the flag."
 ^ strict_call_description;
-see_also = [Link "block_status";
+see_also = [Link "block_status"; Link "block_status_filter";
 Link "add_meta_context"; Link "can_meta_context";
 Link "aio_block_status_64"; Link "set_strict_mode"];
   };

+  "block_status_filter", {
+default_call with
+args = [ UInt64 "count"; UInt64 "offset"; StringList "contexts";
+ Closure extent64_closure ];
+optargs = [ OFlags ("flags", cmd_flags, Some ["REQ_ONE"; "PAYLOAD_LEN"]) ];
+ret = RErr;
+permitted_states = [ Connected ];
+shortdesc = "send filtered block status command, with 64-bit callback";
+longdesc = "\
+Issue a filtered block status command to the NBD server.  If
+supported by the server (see L),
+this causes metadata context information about blocks beginning
+from the specified offset to be returned, and with the result
+limited to just the contexts specified in C.  Note that
+all strings in C must be supported by
+L.
+
+All other parameters to this function have the sam

[PATCH v2 10/15] nbd/client: Accept 64-bit block status chunks

2022-11-14 Thread Eric Blake
Because we use NBD_CMD_FLAG_REQ_ONE with NBD_CMD_BLOCK_STATUS, a
client in narrow mode should not be able to provoke a server into
sending a block status result larger than the client's 32-bit request.
But in extended mode, a 64-bit status request must be able to handle a
64-bit status result, once a future patch enables the client
requesting extended mode.  We can also tolerate a non-compliant server
sending the new chunk even when it should not.

In normal execution, we are only requesting "base:allocation" which
never exceeds 32 bits. But during testing with x-dirty-bitmap, we can
force qemu to connect to some other context that might have 64-bit
status bits; however, we ignore those upper bits (other than mapping
qemu:allocation-depth into something that 'qemu-img map --output=json'
can expose), and since it is only testing, we really don't bother with
checking whether more than the two least-significant bits are set.
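
For reference, the wide chunk handled below lays out its payload like
this (a sketch matching the parsing code in this patch, not a struct
qemu actually declares):

    struct nbd_block_status_ext_payload {
        uint32_t context_id; /* metadata context ID */
        uint32_t count;      /* number of extents that follow */
        /* followed by extents of { uint64_t length; uint64_t flags; },
         * versus { uint32_t length; uint32_t flags; } in the compact
         * NBD_REPLY_TYPE_BLOCK_STATUS chunk */
    };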

Signed-off-by: Eric Blake 
---
 block/nbd.c| 38 +++---
 block/trace-events |  1 +
 2 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index a8b1bc1054..44ab5437ea 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -608,13 +608,16 @@ static int nbd_parse_offset_hole_payload(BDRVNBDState *s,
  */
 static int nbd_parse_blockstatus_payload(BDRVNBDState *s,
  NBDStructuredReplyChunk *chunk,
- uint8_t *payload, uint64_t orig_length,
- NBDExtent *extent, Error **errp)
+ uint8_t *payload, bool wide,
+ uint64_t orig_length,
+ NBDExtentExt *extent, Error **errp)
 {
 uint32_t context_id;
+uint32_t count = 0;
+size_t len = wide ? sizeof(*extent) : sizeof(NBDExtent);

 /* The server succeeded, so it must have sent [at least] one extent */
-if (chunk->length < sizeof(context_id) + sizeof(*extent)) {
+if (chunk->length < sizeof(context_id) + wide * sizeof(count) + len) {
 error_setg(errp, "Protocol error: invalid payload for "
  "NBD_REPLY_TYPE_BLOCK_STATUS");
 return -EINVAL;
@@ -629,8 +632,14 @@ static int nbd_parse_blockstatus_payload(BDRVNBDState *s,
 return -EINVAL;
 }

-extent->length = payload_advance32(&payload);
-extent->flags = payload_advance32(&payload);
+if (wide) {
+count = payload_advance32(&payload);
+extent->length = payload_advance64(&payload);
+extent->flags = payload_advance64(&payload);
+} else {
+extent->length = payload_advance32(&payload);
+extent->flags = payload_advance32(&payload);
+}

 if (extent->length == 0) {
 error_setg(errp, "Protocol error: server sent status chunk with "
@@ -670,7 +679,8 @@ static int nbd_parse_blockstatus_payload(BDRVNBDState *s,
  * connection; just ignore trailing extents, and clamp things to
  * the length of our request.
  */
-if (chunk->length > sizeof(context_id) + sizeof(*extent)) {
+if (count > 1 ||
+chunk->length > sizeof(context_id) + wide * sizeof(count) + len) {
 trace_nbd_parse_blockstatus_compliance("more than one extent");
 }
 if (extent->length > orig_length) {
@@ -1114,7 +1124,7 @@ static int coroutine_fn nbd_co_receive_cmdread_reply(BDRVNBDState *s, uint64_t h

 static int coroutine_fn nbd_co_receive_blockstatus_reply(BDRVNBDState *s,
  uint64_t handle, uint64_t length,
- NBDExtent *extent,
+ NBDExtentExt *extent,
  int *request_ret, Error **errp)
 {
 NBDReplyChunkIter iter;
@@ -1131,6 +1141,11 @@ static int coroutine_fn nbd_co_receive_blockstatus_reply(BDRVNBDState *s,
 assert(nbd_reply_is_structured(&reply));

 switch (chunk->type) {
+case NBD_REPLY_TYPE_BLOCK_STATUS_EXT:
+if (!s->info.extended_headers) {
+trace_nbd_extended_headers_compliance("block_status_ext");
+}
+/* fallthrough */
 case NBD_REPLY_TYPE_BLOCK_STATUS:
 if (received) {
 nbd_channel_error(s, -EINVAL);
@@ -1139,9 +1154,10 @@ static int coroutine_fn nbd_co_receive_blockstatus_reply(BDRVNBDState *s,
 }
 received = true;

-ret = nbd_parse_blockstatus_payload(s, &reply.structured,
-payload, length, extent,
-&local_err);
+ret = nbd_parse_blockstatus_payload(
+s, &reply.structured, payload,
+chunk->type == NBD_REPLY_TYPE_BLOCK_STATUS_EXT,
+length,

[libnbd PATCH v2 21/23] api: Add nbd_can_block_status_payload()

2022-11-14 Thread Eric Blake
In the recent NBD protocol extensions to add 64-bit commands, an
additional option was added to allow NBD_CMD_BLOCK_STATUS pass a
client payload instructing the server to filter its answers (mainly
useful when the client requests more than one meta context with
NBD_OPT_SET_META_CONTEXT).  This patch lays the groundwork by exposing
servers that advertise this capability, although libnbd does not yet
actually utilize it until the next patch.

At the time this patch was written, qemu-nbd was also patched to
provide such support; hence, an interop/ test shows the API in action.
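
As a sketch, the payload that a filtered request will eventually carry
mirrors the struct this patch adds to lib/nbd-protocol.h (hypothetical
fragment: request_length and the two context IDs stand in for values
negotiated earlier):

    #include <endian.h> /* htobe32/htobe64 (glibc) */

    struct nbd_block_status_payload hdr = {
      .length = htobe64 (request_length),
    };
    uint32_t ids[2] = { htobe32 (ctx_id_1), htobe32 (ctx_id_2) };
    /* the NBD_CMD_BLOCK_STATUS payload is hdr followed by ids[] */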
---
 info/nbdinfo.pod|  10 ++-
 lib/nbd-protocol.h  |  29 +---
 generator/API.ml|  18 +
 lib/flags.c |  11 +++
 examples/server-flags.c |   7 +-
 interop/Makefile.am |   6 ++
 interop/block-status-payload.c  | 126 
 interop/block-status-payload.sh |  68 +
 .gitignore  |   1 +
 info/can.c  |   5 ++
 info/show.c |   9 ++-
 11 files changed, 273 insertions(+), 17 deletions(-)
 create mode 100644 interop/block-status-payload.c
 create mode 100755 interop/block-status-payload.sh

diff --git a/info/nbdinfo.pod b/info/nbdinfo.pod
index 2455e1c0..c3a97c3b 100644
--- a/info/nbdinfo.pod
+++ b/info/nbdinfo.pod
@@ -178,6 +178,8 @@ rotating disk: accessing nearby blocks may be faster than random
 access and requests should be sorted to improve performance.  Many
 servers do not or cannot report this accurately.

+=item nbdinfo --can block-status-payload URI
+
 =item nbdinfo --can cache URI

 =item nbdinfo --can df URI
@@ -345,10 +347,10 @@ The command does not print anything.  Instead it exits with success

 For further information see the L<https://github.com/NetworkBlockDevice/nbd/blob/master/doc/proto.md>
-and the following libnbd functions: L,
-L, L, L,
-L, L, L,
-L, L,
+and the following libnbd functions: L,
+L, L, L,
+L, L, L,
+L, L, L,
 L,
 L.

diff --git a/lib/nbd-protocol.h b/lib/nbd-protocol.h
index ac569a11..2d1fabd0 100644
--- a/lib/nbd-protocol.h
+++ b/lib/nbd-protocol.h
@@ -102,17 +102,18 @@ struct nbd_fixed_new_option_reply {
 #define NBD_FLAG_NO_ZEROES (1 << 1)

 /* Per-export flags. */
-#define NBD_FLAG_HAS_FLAGS (1 << 0)
-#define NBD_FLAG_READ_ONLY (1 << 1)
-#define NBD_FLAG_SEND_FLUSH(1 << 2)
-#define NBD_FLAG_SEND_FUA  (1 << 3)
-#define NBD_FLAG_ROTATIONAL(1 << 4)
-#define NBD_FLAG_SEND_TRIM (1 << 5)
-#define NBD_FLAG_SEND_WRITE_ZEROES (1 << 6)
-#define NBD_FLAG_SEND_DF   (1 << 7)
-#define NBD_FLAG_CAN_MULTI_CONN(1 << 8)
-#define NBD_FLAG_SEND_CACHE(1 << 10)
-#define NBD_FLAG_SEND_FAST_ZERO(1 << 11)
+#define NBD_FLAG_HAS_FLAGS(1 << 0)
+#define NBD_FLAG_READ_ONLY(1 << 1)
+#define NBD_FLAG_SEND_FLUSH   (1 << 2)
+#define NBD_FLAG_SEND_FUA (1 << 3)
+#define NBD_FLAG_ROTATIONAL   (1 << 4)
+#define NBD_FLAG_SEND_TRIM(1 << 5)
+#define NBD_FLAG_SEND_WRITE_ZEROES(1 << 6)
+#define NBD_FLAG_SEND_DF  (1 << 7)
+#define NBD_FLAG_CAN_MULTI_CONN   (1 << 8)
+#define NBD_FLAG_SEND_CACHE   (1 << 10)
+#define NBD_FLAG_SEND_FAST_ZERO   (1 << 11)
+#define NBD_FLAG_BLOCK_STATUS_PAYLOAD (1 << 12)

 /* NBD options (new style handshake only). */
 #define NBD_OPT_EXPORT_NAME1
@@ -204,6 +205,12 @@ struct nbd_request_ext {
   uint64_t count;   /* Request effect or payload length. */
 } NBD_ATTRIBUTE_PACKED;

+/* Extended request payload for NBD_CMD_BLOCK_STATUS, when supported. */
+struct nbd_block_status_payload {
+  uint64_t length;  /* Effective length of client request */
+  /* followed by array of uint32_t ids */
+} NBD_ATTRIBUTE_PACKED;
+
 /* Simple reply (server -> client). */
 struct nbd_simple_reply {
   uint32_t magic;   /* NBD_SIMPLE_REPLY_MAGIC. */
diff --git a/generator/API.ml b/generator/API.ml
index c4d15e3a..bbf7c0bb 100644
--- a/generator/API.ml
+++ b/generator/API.ml
@@ -2279,6 +2279,23 @@   "can_fast_zero", {
 example = Some "examples/server-flags.c";
   };

+  "can_block_status_payload", {
+default_call with
+args = []; ret = RBool;
+permitted_states = [ Negotiating; Connected; Closed ];
+shortdesc = "does the server support the block status payload flag?";
+longdesc = "\
+Returns true if the server supports the use of the
+C flag to allow filtering of the
+block status command.  Returns
+false if the server does not.  Note that this will never return
+true if L is false."
+^ non_blocking_test_call_description;
+see_also = [SectionLink "Flag calls"; Link "opt_info";
+Link "get_extended_headers_negotiated"];
+example = Some "examples/server-flags.c";
+  };
+
   "can_df", {
 default_call with
 args = []; ret = RBool;
@@ -4131,6 +4148,7 @@ let first_version =
   "get_extend

[PATCH v2 07/15] nbd/server: Initial support for extended headers

2022-11-14 Thread Eric Blake
Even though the NBD spec has been altered to allow us to accept
NBD_CMD_READ larger than the max payload size (provided our response
is a hole or broken up over more than one data chunk), we are not
planning to take advantage of that, and continue to cap NBD_CMD_READ
to 32M regardless of header size.

For NBD_CMD_WRITE_ZEROES and NBD_CMD_TRIM, the block layer already
supports 64-bit operations without any effort on our part.  For
NBD_CMD_BLOCK_STATUS, the client's length is a hint; the easiest
approach for now is to truncate our answer back to 32 bits, which lets
us delay the effort of implementing NBD_REPLY_TYPE_BLOCK_STATUS_EXT to
a separate patch.
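
For reference, the two request layouts the server must now be prepared
to read, sketched as structs (qemu actually parses a flat byte buffer,
as the updated nbd_receive_request below shows):

    struct compact_request {   /* NBD_REQUEST_SIZE == 28 bytes */
        uint32_t magic;
        uint16_t flags;
        uint16_t type;
        uint64_t handle;
        uint64_t from;
        uint32_t len;
    };
    struct extended_request {  /* NBD_EXTENDED_REQUEST_SIZE == 32 bytes */
        uint32_t magic;
        uint16_t flags;
        uint16_t type;
        uint64_t handle;
        uint64_t from;
        uint64_t len;          /* widened to 64 bits */
    };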

Signed-off-by: Eric Blake 
---
 nbd/nbd-internal.h |   7 ++-
 nbd/server.c   | 132 +++--
 2 files changed, 108 insertions(+), 31 deletions(-)

diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h
index 0016793ff4..f9fe0b6ce3 100644
--- a/nbd/nbd-internal.h
+++ b/nbd/nbd-internal.h
@@ -1,7 +1,7 @@
 /*
  * NBD Internal Declarations
  *
- * Copyright (C) 2016-2021 Red Hat, Inc.
+ * Copyright (C) 2016-2022 Red Hat, Inc.
  *
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
@@ -35,8 +35,11 @@
  * https://github.com/yoe/nbd/blob/master/doc/proto.md
  */

-/* Size of all NBD_OPT_*, without payload */
+/* Size of all compact NBD_CMD_*, without payload */
 #define NBD_REQUEST_SIZE            (4 + 2 + 2 + 8 + 8 + 4)
+/* Size of all extended NBD_CMD_*, without payload */
+#define NBD_EXTENDED_REQUEST_SIZE   (4 + 2 + 2 + 8 + 8 + 8)
+
 /* Size of all NBD_REP_* sent in answer to most NBD_OPT_*, without payload */
 #define NBD_REPLY_SIZE  (4 + 4 + 8)
 /* Size of reply to NBD_OPT_EXPORT_NAME */
diff --git a/nbd/server.c b/nbd/server.c
index 4d1400430b..b46655b4d8 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -141,7 +141,7 @@ struct NBDClient {

 uint32_t check_align; /* If non-zero, check for aligned client requests */

-bool structured_reply;
+bool structured_reply; /* also set true if extended_headers is set */
 bool extended_headers;
 NBDExportMetaContexts export_meta;

@@ -1260,6 +1260,10 @@ static int nbd_negotiate_options(NBDClient *client, Error **errp)
 case NBD_OPT_STRUCTURED_REPLY:
 if (length) {
 ret = nbd_reject_length(client, false, errp);
+} else if (client->extended_headers) {
+ret = nbd_negotiate_send_rep_err(
+client, NBD_REP_ERR_EXT_HEADER_REQD, errp,
+"extended headers already negotiated");
 } else if (client->structured_reply) {
 ret = nbd_negotiate_send_rep_err(
 client, NBD_REP_ERR_INVALID, errp,
@@ -1276,6 +1280,19 @@ static int nbd_negotiate_options(NBDClient *client, Error **errp)
  errp);
 break;

+case NBD_OPT_EXTENDED_HEADERS:
+if (length) {
+ret = nbd_reject_length(client, false, errp);
+} else if (client->extended_headers) {
+ret = nbd_negotiate_send_rep_err(
+client, NBD_REP_ERR_INVALID, errp,
+"extended headers already negotiated");
+} else {
+ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
+client->structured_reply = client->extended_headers = true;
+}
+break;
+
 default:
 ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
"Unsupported option %" PRIu32 " (%s)",
@@ -1411,11 +1428,13 @@ nbd_read_eof(NBDClient *client, void *buffer, size_t size, Error **errp)
 static int nbd_receive_request(NBDClient *client, NBDRequest *request,
Error **errp)
 {
-uint8_t buf[NBD_REQUEST_SIZE];
-uint32_t magic;
+uint8_t buf[NBD_EXTENDED_REQUEST_SIZE];
+uint32_t magic, expect;
 int ret;
+size_t size = client->extended_headers ? NBD_EXTENDED_REQUEST_SIZE
+                                       : NBD_REQUEST_SIZE;

-ret = nbd_read_eof(client, buf, sizeof(buf), errp);
+ret = nbd_read_eof(client, buf, size, errp);
 if (ret < 0) {
 return ret;
 }
@@ -1423,13 +1442,21 @@ static int nbd_receive_request(NBDClient *client, NBDRequest *request,
 return -EIO;
 }

-/* Request
-   [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
-   [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, ...)
-   [ 6 ..  7]   type(NBD_CMD_READ, ...)
-   [ 8 .. 15]   handle
-   [16 .. 23]   from
-   [24 .. 27]   len
+/*
+ * Compact request
+ *  [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
+ *  [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, ...)
+ *  [ 6 ..  7]   type(NBD_CMD_READ, ...)
+ *  [ 8 .. 15]   handle

Re: [PATCH v2 15/15] migration: Drop rs->f

2022-11-14 Thread Juan Quintela
Peter Xu  wrote:
> Now with rs->pss we can already cache channels in pss->pss_channels.  That
> pss_channel contains more information than rs->f because it's per-channel.
> So rs->f could be replaced by rs->pss[RAM_CHANNEL_PRECOPY].pss_channel,
> while rs->f itself is a bit vague now.
>
> Note that vanilla postcopy still sends pages via pss[RAM_CHANNEL_PRECOPY],
> that's slightly confusing but it reflects the reality.
>
> Then, after the replacement we can safely drop rs->f.
>
> Reviewed-by: Dr. David Alan Gilbert 
> Signed-off-by: Peter Xu 

Reviewed-by: Juan Quintela 




[libnbd PATCH v2 20/23] interop: Add test of 64-bit block status

2022-11-14 Thread Eric Blake
Prove that we can round-trip a block status request larger than 4G
through a new-enough qemu-nbd.  Also serves as a unit test of our shim
for converting internal 64-bit representation back to the older 32-bit
nbd_block_status callback interface.
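
A quick sketch of the property the shim has to cope with (illustrative
only):

    #include <stdbool.h>
    #include <stdint.h>

    /* An extent longer than UINT32_MAX cannot be represented in the
     * 32-bit callback's entries[], so the shim must truncate it; that
     * is exactly what this test observes with its >4G extents.
     */
    static bool
    needs_truncation (uint64_t length)
    {
      return length > UINT32_MAX;
    }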
---
 interop/Makefile.am |   6 ++
 interop/large-status.c  | 186 
 interop/large-status.sh |  49 +++
 .gitignore  |   1 +
 4 files changed, 242 insertions(+)
 create mode 100644 interop/large-status.c
 create mode 100755 interop/large-status.sh

diff --git a/interop/Makefile.am b/interop/Makefile.am
index 5e0ea1ed..84c8fbbb 100644
--- a/interop/Makefile.am
+++ b/interop/Makefile.am
@@ -21,6 +21,7 @@ EXTRA_DIST = \
dirty-bitmap.sh \
interop-qemu-storage-daemon.sh \
interop-qemu-block-size.sh \
+   large-status.sh \
list-exports-nbd-config \
list-exports-test-dir/disk1 \
list-exports-test-dir/disk2 \
@@ -134,6 +135,7 @@ check_PROGRAMS += \
list-exports-qemu-nbd \
socket-activation-qemu-nbd \
dirty-bitmap \
+   large-status \
structured-read \
opt-extended-headers \
$(NULL)
@@ -144,6 +146,7 @@ TESTS += \
list-exports-qemu-nbd \
socket-activation-qemu-nbd \
dirty-bitmap.sh \
+   large-status.sh \
structured-read.sh \
interop-qemu-block-size.sh \
opt-extended-headers.sh \
 socket_activation_qemu_nbd_LDADD = $(top_builddir)/lib/libnbd.la
 dirty_bitmap_SOURCES = dirty-bitmap.c
 dirty_bitmap_LDADD = $(top_builddir)/lib/libnbd.la

+large_status_SOURCES = large-status.c
+large_status_LDADD = $(top_builddir)/lib/libnbd.la
+
 structured_read_SOURCES = structured-read.c
 structured_read_LDADD = $(top_builddir)/lib/libnbd.la

diff --git a/interop/large-status.c b/interop/large-status.c
new file mode 100644
index ..44a911d7
--- /dev/null
+++ b/interop/large-status.c
@@ -0,0 +1,186 @@
+/* NBD client library in userspace
+ * Copyright (C) 2013-2022 Red Hat Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Test 64-bit block status with qemu. */
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+static const char *bitmap;
+
+struct data {
+  bool req_one;/* input: true if req_one was passed to request */
+  int count;   /* input: count of expected remaining calls */
+  bool seen_base;  /* output: true if base:allocation encountered */
+  bool seen_dirty; /* output: true if qemu:dirty-bitmap encountered */
+};
+
+static int
+cb32 (void *opaque, const char *metacontext, uint64_t offset,
+  uint32_t *entries, size_t len, int *error)
+{
+  struct data *data = opaque;
+
+  assert (offset == 0);
+  assert (data->count-- > 0);
+
+  if (strcmp (metacontext, LIBNBD_CONTEXT_BASE_ALLOCATION) == 0) {
+assert (!data->seen_base);
+data->seen_base = true;
+
+/* Data block offset 0 size 64k, remainder is hole */
+assert (len == 4);
+assert (entries[0] == 65536);
+assert (entries[1] == 0);
+/* libnbd had to truncate qemu's >4G answer */
+assert (entries[2] == 4227858432);
+assert (entries[3] == (LIBNBD_STATE_HOLE|LIBNBD_STATE_ZERO));
+  }
+  else if (strcmp (metacontext, bitmap) == 0) {
+assert (!data->seen_dirty);
+data->seen_dirty = true;
+
+/* Dirty block at offset 5G-64k, remainder is clean */
+/* libnbd had to truncate qemu's >4G answer */
+assert (len == 2);
+assert (entries[0] == 4227858432);
+assert (entries[1] == 0);
+  }
+  else {
+fprintf (stderr, "unexpected context %s\n", metacontext);
+exit (EXIT_FAILURE);
+  }
+  return 0;
+}
+
+static int
+cb64 (void *opaque, const char *metacontext, uint64_t offset,
+  nbd_extent *entries, size_t len, int *error)
+{
+  struct data *data = opaque;
+
+  assert (offset == 0);
+  assert (data->count-- > 0);
+
+  if (strcmp (metacontext, LIBNBD_CONTEXT_BASE_ALLOCATION) == 0) {
+assert (!data->seen_base);
+data->seen_base = true;
+
+/* Data block offset 0 size 64k, remainder is hole */
+assert (len == 2);
+assert (entries[0].length == 65536);
+assert (entries[0].flags == 0);
+assert (entries[1].length == 5368643584ULL);
+assert (entr

[PATCH] ccid-card-emulated: fix cast warning

2022-11-14 Thread marcandre . lureau
From: Marc-André Lureau 

../hw/usb/ccid-card-emulated.c: In function 'handle_apdu_thread':
../hw/usb/ccid-card-emulated.c:251:24: error: cast from pointer to integer of 
different size [-Werror=pointer-to-int-cast]
  251 | assert((unsigned long)event > 1000);

Signed-off-by: Marc-André Lureau 
---
 hw/usb/ccid-card-emulated.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/usb/ccid-card-emulated.c b/hw/usb/ccid-card-emulated.c
index ee41a81801..c328660075 100644
--- a/hw/usb/ccid-card-emulated.c
+++ b/hw/usb/ccid-card-emulated.c
@@ -248,7 +248,7 @@ static void *handle_apdu_thread(void* arg)
 WITH_QEMU_LOCK_GUARD(&card->vreader_mutex) {
 while (!QSIMPLEQ_EMPTY(&card->guest_apdu_list)) {
 event = QSIMPLEQ_FIRST(&card->guest_apdu_list);
-assert((unsigned long)event > 1000);
+assert(event != NULL);
 QSIMPLEQ_REMOVE_HEAD(&card->guest_apdu_list, entry);
 if (event->p.data.type != EMUL_GUEST_APDU) {
 DPRINTF(card, 1, "unexpected message in 
handle_apdu_thread\n");
-- 
2.38.1




[libnbd PATCH v2 01/23] block_status: Refactor array storage

2022-11-14 Thread Eric Blake
For 32-bit block status, we were able to cheat and use an array with
an odd number of elements, with array[0] holding the context id, and
passing &array[1] to the user's callback.  But once we have 64-bit
extents, we can no longer abuse array element 0 like that, for two
reasons: 64-bit extents contain uint64_t which might not be
alignment-compatible with an array of uint32_t on all architectures,
and the new NBD_REPLY_TYPE_BLOCK_STATUS_EXT adds an additional count
field before the array.

Split out a new state STRUCTURED_REPLY.BS_HEADER to receive the
context id (and eventually the new count field for 64-bit replies)
separately from the extents array, and add another structured_reply
type in the payload section for tracking it.  No behavioral change,
other than the rare possibility of landing in the new state.
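
A minimal illustration of the alignment hazard (struct ext64 is a
stand-in for the 64-bit extent type that later patches add):

    #include <stdint.h>

    struct ext64 { uint64_t length; uint64_t flags; };

    void demo (uint32_t *array)
    {
      /* The old trick stored the context id in array[0] and handed
       * &array[1] to the callback.  With 64-bit extents that cast is
       * no longer safe: &array[1] only has 4-byte alignment, while
       * struct ext64 may require 8-byte alignment.
       */
      struct ext64 *e = (struct ext64 *) &array[1];
      (void) e;
    }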
---
 lib/internal.h  |  1 +
 lib/nbd-protocol.h  | 19 ++
 generator/state_machine.ml  |  9 -
 generator/states-reply-structured.c | 56 -
 4 files changed, 61 insertions(+), 24 deletions(-)

diff --git a/lib/internal.h b/lib/internal.h
index bbbd2639..fe81f1a0 100644
--- a/lib/internal.h
+++ b/lib/internal.h
@@ -241,6 +241,7 @@ struct nbd_handle {
   union {
 struct nbd_structured_reply_offset_data offset_data;
 struct nbd_structured_reply_offset_hole offset_hole;
+struct nbd_structured_reply_block_status_hdr bs_hdr;
 struct {
   struct nbd_structured_reply_error error;
   char msg[NBD_MAX_STRING]; /* Common to all error types */
diff --git a/lib/nbd-protocol.h b/lib/nbd-protocol.h
index e5d6404b..4400d3ab 100644
--- a/lib/nbd-protocol.h
+++ b/lib/nbd-protocol.h
@@ -1,5 +1,5 @@
 /* nbdkit
- * Copyright (C) 2013-2020 Red Hat Inc.
+ * Copyright (C) 2013-2022 Red Hat Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
@@ -182,12 +182,6 @@ struct nbd_fixed_new_option_reply_meta_context {
   /* followed by a string */
 } NBD_ATTRIBUTE_PACKED;

-/* NBD_REPLY_TYPE_BLOCK_STATUS block descriptor. */
-struct nbd_block_descriptor {
-  uint32_t length;  /* length of block */
-  uint32_t status_flags;/* block type (hole etc) */
-} NBD_ATTRIBUTE_PACKED;
-
 /* Request (client -> server). */
 struct nbd_request {
   uint32_t magic;   /* NBD_REQUEST_MAGIC. */
@@ -224,6 +218,17 @@ struct nbd_structured_reply_offset_hole {
   uint32_t length;  /* Length of hole. */
 } NBD_ATTRIBUTE_PACKED;

+/* NBD_REPLY_TYPE_BLOCK_STATUS block descriptor. */
+struct nbd_block_descriptor {
+  uint32_t length;  /* length of block */
+  uint32_t status_flags;/* block type (hole etc) */
+} NBD_ATTRIBUTE_PACKED;
+
+struct nbd_structured_reply_block_status_hdr {
+  uint32_t context_id;  /* metadata context ID */
+  /* followed by array of nbd_block_descriptor extents */
+} NBD_ATTRIBUTE_PACKED;
+
 struct nbd_structured_reply_error {
   uint32_t error;   /* NBD_E* error number */
   uint16_t len; /* Length of human readable error. */
diff --git a/generator/state_machine.ml b/generator/state_machine.ml
index 257cb4f4..d2c326f3 100644
--- a/generator/state_machine.ml
+++ b/generator/state_machine.ml
@@ -871,10 +871,17 @@ and
 external_events = [];
   };

+  State {
+default_state with
+name = "RECV_BS_HEADER";
+comment = "Receive header of structured reply block-status payload";
+external_events = [];
+  };
+
   State {
 default_state with
 name = "RECV_BS_ENTRIES";
-comment = "Receive a structured reply block-status payload";
+comment = "Receive entries array of structured reply block-status payload";
 external_events = [];
   };

diff --git a/generator/states-reply-structured.c b/generator/states-reply-structured.c
index 2456e6da..bbd3de0c 100644
--- a/generator/states-reply-structured.c
+++ b/generator/states-reply-structured.c
@@ -126,19 +126,10 @@  REPLY.STRUCTURED_REPLY.CHECK:
 length < 12 || ((length-4) & 7) != 0)
   goto resync;
 assert (CALLBACK_IS_NOT_NULL (cmd->cb.fn.extent));
-/* We read the context ID followed by all the entries into a
- * single array and deal with it at the end.
- */
-free (h->bs_entries);
-h->bs_entries = malloc (length);
-if (h->bs_entries == NULL) {
-  SET_NEXT_STATE (%.DEAD);
-  set_error (errno, "malloc");
-  break;
-}
-h->rbuf = h->bs_entries;
-h->rlen = length;
-SET_NEXT_STATE (%RECV_BS_ENTRIES);
+/* Start by reading the context ID. */
+h->rbuf = &h->sbuf.sr.payload.bs_hdr;
+h->rlen = sizeof h->sbuf.sr.payload.bs_hdr;
+SET_NEXT_STATE (%RECV_BS_HEADER);
 break;

   default:
@@ -424,9 +415,41 @@  REPLY.STRUCTURED_REPLY.RECV_OFFSET_HOLE:
   }
   return 0;

+ REPLY.STRUCTURED_REPLY.RECV_BS_HEADER:
+  struct command *cmd = h->reply_cmd;
+  uint32_t le

Re: [PATCH] hw/intc: sifive_plic: Renumber the S irqs for numa support

2022-11-14 Thread Frédéric Pétrot

On 14/11/2022 at 09:40, Philippe Mathieu-Daudé wrote:

On 11/11/22 13:19, Frédéric Pétrot wrote:

Commit 40244040 changed the way the S irqs are numbered. This breaks when


40244040a7 in case?


  Seems reasonable, indeed, I'll even align with what git blame shows
  (11 chars, so 40244040a7a).


using numa configuration, e.g.:
./qemu-system-riscv64 -nographic -machine virt,dumpdtb=numa-tree.dtb \
   -m 2G -smp cpus=16 \
  -object memory-backend-ram,id=mem0,size=512M \
  -object memory-backend-ram,id=mem1,size=512M \
  -object memory-backend-ram,id=mem2,size=512M \
  -object memory-backend-ram,id=mem3,size=512M \
  -numa node,cpus=0-3,memdev=mem0,nodeid=0 \
  -numa node,cpus=4-7,memdev=mem1,nodeid=1 \
  -numa node,cpus=8-11,memdev=mem2,nodeid=2 \
  -numa node,cpus=12-15,memdev=mem3,nodeid=3
leads to:
Unexpected error in object_property_find_err() at ../qom/object.c:1304:
qemu-system-riscv64: Property 'riscv.sifive.plic.unnamed-gpio-out[8]' not
found

This patch makes the numbering of the S irqs identical to what it was before.

Signed-off-by: Frédéric Pétrot 
---
  hw/intc/sifive_plic.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/intc/sifive_plic.c b/hw/intc/sifive_plic.c
index c2dfacf028..89d2122742 100644
--- a/hw/intc/sifive_plic.c
+++ b/hw/intc/sifive_plic.c
@@ -480,7 +480,7 @@ DeviceState *sifive_plic_create(hwaddr addr, char 
*hart_config,

    qdev_get_gpio_in(DEVICE(cpu), IRQ_M_EXT));
  }
  if (plic->addr_config[i].mode == PLICMode_S) {
-    qdev_connect_gpio_out(dev, cpu_num,
+    qdev_connect_gpio_out(dev, cpu_num - plic->hartid_base,
    qdev_get_gpio_in(DEVICE(cpu), IRQ_S_EXT));
  }
  }


Oops.
Eventually we could unify the style:

-- >8 --
@@ -476,11 +476,11 @@ DeviceState *sifive_plic_create(hwaddr addr, char 
*hart_config,

  CPUState *cpu = qemu_get_cpu(cpu_num);

  if (plic->addr_config[i].mode == PLICMode_M) {
-    qdev_connect_gpio_out(dev, num_harts - plic->hartid_base + cpu_num,
+    qdev_connect_gpio_out(dev, cpu_num - hartid_base + num_harts,
    qdev_get_gpio_in(DEVICE(cpu), IRQ_M_EXT));
  }
  if (plic->addr_config[i].mode == PLICMode_S) {
-    qdev_connect_gpio_out(dev, cpu_num,
+    qdev_connect_gpio_out(dev, cpu_num - hartid_base,
    qdev_get_gpio_in(DEVICE(cpu), IRQ_S_EXT));
  }
  }
---


  IIUC hartid_base is used to set plic->hartid_base, so agreed, along with the
  style unification.
  I'll send a v2, then.
  Since Alistair already queued the patch, how shall I proceed?


Reviewed-by: Philippe Mathieu-Daudé 



--
+---+
| Frédéric Pétrot,Pr. Grenoble INP-Ensimag/TIMA |
| Mob/Pho: +33 6 74 57 99 65/+33 4 76 57 48 70  Ad augusta  per angusta |
| http://tima.univ-grenoble-alpes.fr frederic.pet...@univ-grenoble-alpes.fr |
+---+



Re: [PATCH 11/13] block: Remove ignore_bds_parents parameter from drain functions

2022-11-14 Thread Hanna Reitz

On 08.11.22 13:37, Kevin Wolf wrote:

ignore_bds_parents is now ignored, so we can just remove it.

Signed-off-by: Kevin Wolf 
---
  include/block/block-io.h | 10 ++
  block.c  |  4 +--
  block/io.c   | 78 +++-
  3 files changed, 32 insertions(+), 60 deletions(-)


Reviewed-by: Hanna Reitz 




[libnbd PATCH v2 14/23] info: Expose extended-headers support through nbdinfo

2022-11-14 Thread Eric Blake
Add another bit of overall server information, as well as a '--can
extended-headers' silent query.  For now, the testsuite is written
assuming that when nbdkit finally adds extended headers support, it
will also add a --no-eh kill switch comparable to its existing --no-sr
switch.
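
C callers can make the same query directly (a short fragment against
the existing API; "nbd" is an already-connected handle):

    int eh = nbd_get_extended_headers_negotiated (nbd);
    if (eh == -1)
      fprintf (stderr, "%s\n", nbd_get_error ());
    else
      printf ("extended headers %snegotiated\n", eh ? "" : "not ");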
---
 info/nbdinfo.pod | 11 ++-
 info/can.c   |  9 +
 info/info-can.sh | 27 +++
 info/info-packets.sh | 17 -
 info/main.c  |  7 ++-
 5 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/info/nbdinfo.pod b/info/nbdinfo.pod
index c47e5175..2455e1c0 100644
--- a/info/nbdinfo.pod
+++ b/info/nbdinfo.pod
@@ -86,6 +86,7 @@ the I<--json> parameter:
"protocol": "newstyle-fixed",
"TLS": false,
"structured": true,
+   "extended": false,
"exports": [
  {
"export-name": "",
@@ -165,6 +166,11 @@ Test if the NBD URI connection is using TLS.
 Test if server can respond with structured replies (a prerequisite
 for supporting block status commands).

+=item nbdinfo --can extended-headers URI
+
+Test if server supports extended headers (a prerequisite for
+supporting 64-bit commands; implies structured replies as well).
+
 =item nbdinfo --is rotational URI

 Test if the server export is backed by something which behaves like a
@@ -312,6 +318,8 @@ Display brief command line help and exit.

 =item B<--can df>

+=item B<--can extended-headers>
+
 =item B<--can fast-zero>

 =item B<--can flush>
@@ -341,7 +349,8 @@ and the following libnbd functions: L,
 L, L, L,
 L, L, L,
 L, L,
-L.
+L,
+L.

 =item B<--color>

diff --git a/info/can.c b/info/can.c
index 08d6bcd5..f602ffce 100644
--- a/info/can.c
+++ b/info/can.c
@@ -50,6 +50,15 @@ do_can (void)
strcasecmp (can, "structured_replies") == 0)
 feature = nbd_get_structured_replies_negotiated (nbd);

+  else if (strcasecmp (can, "eh") == 0 ||
+   strcasecmp (can, "extended header") == 0 ||
+   strcasecmp (can, "extended-header") == 0 ||
+   strcasecmp (can, "extended_header") == 0 ||
+   strcasecmp (can, "extended headers") == 0 ||
+   strcasecmp (can, "extended-headers") == 0 ||
+   strcasecmp (can, "extended_headers") == 0)
+feature = nbd_get_extended_headers_negotiated (nbd);
+
   else if (strcasecmp (can, "readonly") == 0 ||
strcasecmp (can, "read-only") == 0 ||
strcasecmp (can, "read_only") == 0)
diff --git a/info/info-can.sh b/info/info-can.sh
index 3edc3948..e5f6a44b 100755
--- a/info/info-can.sh
+++ b/info/info-can.sh
@@ -61,6 +61,33 @@ esac
 EOF
 test $st = 2

+# --can extended-headers cannot be positively tested until nbdkit gains
+# --no-eh support.  Otherwise, it is similar to --can structured-reply.
+
+no_eh=
+if nbdkit --no-eh --help >/dev/null 2>/dev/null; then
+no_eh=--no-eh
+nbdkit -v -U - sh - \
+   --run '$VG nbdinfo --can extended-headers "nbd+unix:///?socket=$unixsocket"' <<'EOF'
+case "$1" in
+  get_size) echo 1024 ;;
+  pread) ;;
+  *) exit 2 ;;
+esac
+EOF
+fi
+
+st=0
+nbdkit -v -U - $no_eh sh - \
+   --run '$VG nbdinfo --can extended-headers "nbd+unix:///?socket=$unixsocket"' <<'EOF' || st=$?
+case "$1" in
+  get_size) echo 1024 ;;
+  pread) ;;
+  *) exit 2 ;;
+esac
+EOF
+test $st = 2
+
 # --can cache and --can fua require special handling because in
 # nbdkit-sh-plugin we must print "native" or "none".  Also the can_fua
 # flag is only sent if the export is writable (hence can_write below).
diff --git a/info/info-packets.sh b/info/info-packets.sh
index 82bb526c..a6b307a0 100755
--- a/info/info-packets.sh
+++ b/info/info-packets.sh
@@ -27,12 +27,27 @@ requires nbdkit --no-sr memory --version
 out=info-packets.out
 cleanup_fn rm -f $out

+# Older nbdkit does not support extended headers; --no-eh is a reliable
+# witness of whether nbdkit is new enough.
+
+no_eh=
+if nbdkit --no-eh --help >/dev/null 2>/dev/null; then
+no_eh=--no-eh
+fi
+
 nbdkit --no-sr -U - memory size=1M \
--run '$VG nbdinfo "nbd+unix:///?socket=$unixsocket"' > $out
 cat $out
 grep "protocol: .*using simple packets" $out

-nbdkit -U - memory size=1M \
+nbdkit $no_eh -U - memory size=1M \
--run '$VG nbdinfo "nbd+unix:///?socket=$unixsocket"' > $out
 cat $out
 grep "protocol: .*using structured packets" $out
+
+if test x != "x$no_eh"; then
+nbdkit -U - memory size=1M \
+   --run '$VG nbdinfo "nbd+unix:///?socket=$unixsocket"' > $out
+cat $out
+grep "protocol: .*using extended packets" $out
+fi
diff --git a/info/main.c b/info/main.c
index 5cd91fe1..9794c109 100644
--- a/info/main.c
+++ b/info/main.c
@@ -302,11 +302,13 @@ main (int argc, char *argv[])
 const char *protocol;
 int tls_negotiated;
 int sr_negotiated;
+int eh_negotiated;

 /* Print per-connection fields. */
 protocol = nbd_get_protocol (nbd);
 tls_negotiated = nbd_get_tls_negotiated (nbd);
 sr_negotiated = nbd_get_structured_replies_negotiated (nbd);
+eh_

Re: [PATCH v3] qapi/qom: Memory backend property prealloc-threads doc fix

2022-11-14 Thread Zhenyu Zhang
Thanks all,
I will send a v4 patch to fix the 80-character line limit issue.

On Sat, Nov 12, 2022 at 6:05 AM Gavin Shan  wrote:
>
> On 11/11/22 6:54 PM, Igor Mammedov wrote:
> > On Fri, 11 Nov 2022 17:34:04 +0800
> > Gavin Shan  wrote:
> >> On 11/11/22 5:13 PM, Igor Mammedov wrote:
> >>> On Fri, 11 Nov 2022 07:47:16 +0100
> >>> Markus Armbruster  wrote:
>  Gavin Shan  writes:
> > On 11/11/22 11:05 AM, Zhenyu Zhang wrote:
> >> Commit ffac16fab3 "hostmem: introduce "prealloc-threads" property"
> >> (v5.0.0) changed the default number of threads from number of CPUs
> >> to 1.  This was deemed a regression, and fixed in commit f8d426a685
> >> "hostmem: default the amount of prealloc-threads to smp-cpus".
> >> Except the documentation remained unchanged.  Update it now.
> >> Signed-off-by: Zhenyu Zhang 
> >> ---
> >> v3: Covers historical descriptions  (Markus)
> >> v2: The property is changed to smp-cpus since 5.0   (Phild)
> >> ---
> >> qapi/qom.json | 2 +-
> >> 1 file changed, 1 insertion(+), 1 deletion(-)
> >>
> >
> > With the following comments addressed:
> >
> > Reviewed-by: Gavin Shan 
> >
> > ---
> >
> > Please consider amending the commit log to something like below.
> >
> > The default "prealloc-threads" value is set to 1 when the property is
> > added by commit ffac16fab33b ("hostmem: introduce "prealloc-threads"
> > property") in v5.0.0. The default value is conflicting with the sugar
> > property as the value provided by the sugar property is number of CPUs.
> 
>  What is the sugar property?  Can you explain the conflict in a bit more
>  detail?
> >>>
> >>> my guess is that Gavin means mem_prealloc compat glue in 
> >>> qemu_process_sugar_options()
> >>>
> >>> property value should be set according to following order
> >>>default -> compat -> explicit value
> >>> so I don't see any conflict here.
> >>>
> >>> PS:
> >>> if it we up to me, default would have stayed 1,
> >>> and prealloc-threads fixup to vCPUs number would happen in vl.c
> >>> similar to what is done in qemu_process_sugar_options(),
> >>> keeping backend clean of external dependencies.
> >>>
> >>
> >> Yes, it's the sugar property I was talking about. I'm not sure if
> >> we have a more popular name for this property: compat property or
> >> sugar property.
> >>
> >> When 'mem-prealloc=on' and 'prealloc-threads=xxx' aren't provided,
> >> the value is 1 before commit f8d426a6852c is applied. It's not
> >> inconsistent with 'mem-prealloc=on'. It's the conflict I was talking
> >> about and it's fixed by commit f8d426a6852c
> >
> > default was not supposed to be consistent with legacy mem-prealloc
> > and sugar property takes care of mem-prealloc=on case.
> >
> > so commit message in its current form looks fine to me.
> >
>
> Ok, thanks for your confirmation. I think Zhenyu needs to post v4 to fix
> the 80-character line limit issue. My reviewed-by is still valid.
>
> 
> > The conflict has been fixed by commit f8d426a6852c ("hostmem: default
> > the amount of prealloc-threads to smp-cpus"). However, 'qapi/qom.json'
> > was missed to be updated accordingly in the commit.
> >
> > Update 'qapi/qom.json' to reflect the change in commit f8d426a6852c.
> >
> > Signed-off-by: Zhenyu Zhang 
> >
> > When a specific commit is mentioned in the commit log, we usually have
> > fixed format like below.
> >
> > commit ffac16fab33b ("hostmem: introduce "prealloc-threads" property")
> > commit f8d426a6852c ("hostmem: default the amount of prealloc-threads 
> > to smp-cpus")
> 
>  This is certainly a common format, but the other one is also in use.
> 
> >> diff --git a/qapi/qom.json b/qapi/qom.json
> >> index 30e76653ad..dfd89bc6d4 100644
> >> --- a/qapi/qom.json
> >> +++ b/qapi/qom.json
> >> @@ -576,7 +576,7 @@
> >> #
> >> # @prealloc: if true, preallocate memory (default: false)
> >> #
> >> -# @prealloc-threads: number of CPU threads to use for prealloc (default: 1)
> >> +# @prealloc-threads: number of CPU threads to use for prealloc (default: number of CPUs) (since 5.0)
> >> #
> >> # @prealloc-context: thread context to use for creation of preallocation threads
> >> #(default: none) (since 7.2)
> >>
> >
> > The line seems exceeding 80 characters. It'd better to limit each line 
> > in 75 characters.
> > So you probably need:
> >
> >   # @prealloc-threads: number of CPU threads to use for prealloc (default: number of CPUs)
> >   #(since 5.0)
> 
>  Still exceeds :)
> 
>  I suggested
> 
>  # @prealloc-threads: number of CPU threads to use for prealloc
>  #(default: number of CPUs) (si

Re: [PATCH v2 1/3] hw/misc/pfsoc: add fabric clocks to ioscb

2022-11-14 Thread Alistair Francis
On Sat, Nov 12, 2022 at 11:36 PM Conor Dooley  wrote:
>
> From: Conor Dooley 
>
> On PolarFire SoC, some peripherals (eg the PCI root port) are clocked by
> "Clock Conditioning Circuitry" in the FPGA. The specific clock depends
> on the FPGA bitstream & can be locked to one particular {D,P}LL - in the
> Icicle Kit Reference Design v2022.09 or later this is/will be the case.
>
> Linux v6.1+ will have a driver for this peripheral and devicetrees that
> previously relied on "fixed-frequency" clock nodes have been switched
> over to clock-controller nodes. The IOSCB region is represented in QEMU,
> but the specific region of it that the CCCs occupy has not so v6.1-rcN
> kernels fail to boot in QEMU.
>
> Add the regions as unimplemented so that the status-quo in terms of boot
> is maintained.
>
> Signed-off-by: Conor Dooley 

Acked-by: Alistair Francis 

Alistair

> ---
>  hw/misc/mchp_pfsoc_ioscb.c | 6 ++
>  include/hw/misc/mchp_pfsoc_ioscb.h | 1 +
>  2 files changed, 7 insertions(+)
>
> diff --git a/hw/misc/mchp_pfsoc_ioscb.c b/hw/misc/mchp_pfsoc_ioscb.c
> index f4fd55a0e5..f976e42f72 100644
> --- a/hw/misc/mchp_pfsoc_ioscb.c
> +++ b/hw/misc/mchp_pfsoc_ioscb.c
> @@ -33,6 +33,7 @@
>   */
>  #define IOSCB_WHOLE_REG_SIZE    0x1000
>  #define IOSCB_SUBMOD_REG_SIZE   0x1000
> +#define IOSCB_CCC_REG_SIZE  0x200
>
>  /*
>   * There are many sub-modules in the IOSCB module.
> @@ -45,6 +46,7 @@
>  #define IOSCB_LANE23_BASE   0x06510000
>  #define IOSCB_CTRL_BASE 0x07020000
>  #define IOSCB_CFG_BASE  0x07080000
> +#define IOSCB_CCC_BASE  0x08000000
>  #define IOSCB_PLL_MSS_BASE  0x0E001000
>  #define IOSCB_CFM_MSS_BASE  0x0E002000
>  #define IOSCB_PLL_DDR_BASE  0x0E01
> @@ -168,6 +170,10 @@ static void mchp_pfsoc_ioscb_realize(DeviceState *dev, Error **errp)
>"mchp.pfsoc.ioscb.cfg", IOSCB_SUBMOD_REG_SIZE);
>  memory_region_add_subregion(&s->container, IOSCB_CFG_BASE, &s->cfg);
>
> +memory_region_init_io(&s->ccc, OBJECT(s), &mchp_pfsoc_dummy_ops, s,
> +  "mchp.pfsoc.ioscb.ccc", IOSCB_CCC_REG_SIZE);
> +memory_region_add_subregion(&s->container, IOSCB_CCC_BASE, &s->ccc);
> +
>  memory_region_init_io(&s->pll_mss, OBJECT(s), &mchp_pfsoc_pll_ops, s,
>"mchp.pfsoc.ioscb.pll_mss", IOSCB_SUBMOD_REG_SIZE);
>  memory_region_add_subregion(&s->container, IOSCB_PLL_MSS_BASE, &s->pll_mss);
> diff --git a/include/hw/misc/mchp_pfsoc_ioscb.h b/include/hw/misc/mchp_pfsoc_ioscb.h
> index 9235523e33..687b213742 100644
> --- a/include/hw/misc/mchp_pfsoc_ioscb.h
> +++ b/include/hw/misc/mchp_pfsoc_ioscb.h
> @@ -30,6 +30,7 @@ typedef struct MchpPfSoCIoscbState {
>  MemoryRegion lane23;
>  MemoryRegion ctrl;
>  MemoryRegion cfg;
> +MemoryRegion ccc;
>  MemoryRegion pll_mss;
>  MemoryRegion cfm_mss;
>  MemoryRegion pll_ddr;
> --
> 2.37.2
>
>



aarch64 GitLab CI runner down

2022-11-14 Thread Stefan Hajnoczi
Hi Alex and Richard,
The aarch64 GitLab CI runner is down again. Are you able to restart it?

Any idea why it disconnects sometimes?

Thanks,
Stefan



Re: [PATCH 13/13] block: Remove poll parameter from bdrv_parent_drained_begin_single()

2022-11-14 Thread Hanna Reitz

On 08.11.22 13:37, Kevin Wolf wrote:

All callers of bdrv_parent_drained_begin_single() pass poll=false now,
so we don't need the parameter any more.

Signed-off-by: Kevin Wolf 
---
  include/block/block-io.h | 5 ++---
  block.c  | 4 ++--
  block/io.c   | 7 ++-
  3 files changed, 6 insertions(+), 10 deletions(-)


Well, “drained_begin” does not mean “drain”, so...

Reviewed-by: Hanna Reitz 




[libnbd PATCH v2 15/23] info: Update nbdinfo --map to use 64-bit block status

2022-11-14 Thread Eric Blake
Although we usually map "base:allocation" which doesn't require the
use of the 64-bit API for flags, this application IS intended to map
out other metacontexts that might have 64-bit flags.  And when
extended headers are in use, we might as well ask for the server to
give us extents as large as it wants, rather than breaking things up
at 4G boundaries.
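
The sizing decision itself is small; in sketch form (mirroring the
change to do_map below, where max_len otherwise keeps its pre-existing
cap):

    #include <stdint.h>
    #include <libnbd.h>

    /* With extended headers there is no reason to split requests at
     * 4G boundaries; ask for the whole remaining range and let the
     * server return extents as large as it likes.
     */
    static uint64_t
    choose_max_len (struct nbd_handle *nbd, int64_t size, uint64_t max_len)
    {
      if (nbd_get_extended_headers_negotiated (nbd) == 1)
        return (uint64_t) size;
      return max_len;
    }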
---
 info/map.c | 67 --
 1 file changed, 35 insertions(+), 32 deletions(-)

diff --git a/info/map.c b/info/map.c
index a5aad955..ffa53b81 100644
--- a/info/map.c
+++ b/info/map.c
@@ -1,5 +1,5 @@
 /* NBD client library in userspace
- * Copyright (C) 2020-2021 Red Hat Inc.
+ * Copyright (C) 2020-2022 Red Hat Inc.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -36,13 +36,13 @@

 #include "nbdinfo.h"

-DEFINE_VECTOR_TYPE (uint32_vector, uint32_t)
+DEFINE_VECTOR_TYPE (uint64_vector, uint64_t)

-static void print_extents (uint32_vector *entries);
-static void print_totals (uint32_vector *entries, int64_t size);
+static void print_extents (uint64_vector *entries);
+static void print_totals (uint64_vector *entries, int64_t size);
 static int extent_callback (void *user_data, const char *metacontext,
 uint64_t offset,
-uint32_t *entries, size_t nr_entries,
+nbd_extent *entries, size_t nr_entries,
 int *error);

 void
@@ -50,7 +50,7 @@ do_map (void)
 {
   size_t i;
   int64_t size;
-  uint32_vector entries = empty_vector;
+  uint64_vector entries = empty_vector;
   uint64_t offset, align, max_len;
   size_t prev_entries_size;

@@ -69,14 +69,16 @@ do_map (void)
 fprintf (stderr, "%s: %s\n", progname, nbd_get_error ());
 exit (EXIT_FAILURE);
   }
+  if (nbd_get_extended_headers_negotiated (nbd) == 1)
+max_len = size;

   for (offset = 0; offset < size;) {
 prev_entries_size = entries.len;
-if (nbd_block_status (nbd, MIN (size - offset, max_len), offset,
-  (nbd_extent_callback) {
-.callback = extent_callback,
-.user_data = &entries },
-  0) == -1) {
+if (nbd_block_status_64 (nbd, MIN (size - offset, max_len), offset,
+ (nbd_extent64_callback) {
+   .callback = extent_callback,
+   .user_data = &entries },
+ 0) == -1) {
   fprintf (stderr, "%s: %s\n", progname, nbd_get_error ());
   exit (EXIT_FAILURE);
 }
@@ -99,18 +101,18 @@ do_map (void)
 }

 /* Callback handling --map. */
-static void print_one_extent (uint64_t offset, uint64_t len, uint32_t type);
-static void extent_description (const char *metacontext, uint32_t type,
+static void print_one_extent (uint64_t offset, uint64_t len, uint64_t type);
+static void extent_description (const char *metacontext, uint64_t type,
 char **descr, bool *free_descr,
 const char **fg, const char **bg);

 static int
 extent_callback (void *user_data, const char *metacontext,
  uint64_t offset,
- uint32_t *entries, size_t nr_entries,
+ nbd_extent *entries, size_t nr_entries,
  int *error)
 {
-  uint32_vector *list = user_data;
+  uint64_vector *list = user_data;
   size_t i;

   if (strcmp (metacontext, map) != 0)
@@ -120,7 +122,8 @@ extent_callback (void *user_data, const char *metacontext,
* print_extents below.
*/
   for (i = 0; i < nr_entries; ++i) {
-if (uint32_vector_append (list, entries[i]) == -1) {
+if (uint64_vector_append (list, entries[i].length) == -1 ||
+uint64_vector_append (list, entries[i].flags) == -1) {
   perror ("realloc");
   exit (EXIT_FAILURE);
 }
@@ -129,7 +132,7 @@ extent_callback (void *user_data, const char *metacontext,
 }

 static void
-print_extents (uint32_vector *entries)
+print_extents (uint64_vector *entries)
 {
   size_t i, j;
   uint64_t offset = 0;  /* end of last extent printed + 1 */
@@ -138,7 +141,7 @@ print_extents (uint32_vector *entries)
   if (json_output) fprintf (fp, "[\n");

   for (i = 0; i < entries->len; i += 2) {
-uint32_t type = entries->ptr[last+1];
+uint64_t type = entries->ptr[last+1];

 /* If we're coalescing and the current type is different from the
  * previous one then we should print everything up to this entry.
@@ -157,7 +160,7 @@ print_extents (uint32_vector *entries)

   /* Print the last extent if there is one. */
   if (last != i) {
-uint32_t type = entries->ptr[last+1];
+uint64_t type = entries->ptr[last+1];
 uint64_t len;

 for (j = last, len = 0; j < i; j += 2)
@@ -169,7 +172,7 @@ print_extents (uint32_vector *entries)
 }

 static void
-print_one_extent (uint64_t offset, 

Re: [PATCH v6 1/2] Update AVX512 support for xbzrle_encode_buffer

2022-11-14 Thread Juan Quintela
ling xu  wrote:
> This commit updates code of avx512 support for xbzrle_encode_buffer
> function to accelerate xbzrle encoding speed. Runtime check of avx512
> support and benchmark for this feature are added. Compared with C
> version of xbzrle_encode_buffer function, avx512 version can achieve
> 50%-70% performance improvement on benchmarking. In addition, if dirty
> data is randomly located in 4K page, the avx512 version can achieve
> almost 140% performance gain.
>
> Signed-off-by: ling xu 
> Co-authored-by: Zhou Zhao 
> Co-authored-by: Jun Jin 

Reviewed-by: Juan Quintela 

queued.




Re: [PATCH v3 2/2] vhost-net: Fix the virtio features negotiation flaw

2022-11-14 Thread Hyman




On 2022/11/11 3:17, Michael S. Tsirkin wrote:

On Sun, Oct 30, 2022 at 09:52:39PM +0800, huang...@chinatelecom.cn wrote:

From: Hyman Huang(黄勇) 

Save the acked_features once they are configured by the guest
virtio driver, so we can't miss any features.

Note that this patch also changes the features-saving logic
in chr_closed_bh, which originally backed up the features no
matter whether they were 0 or not, but now does so only if
the features aren't 0.

As to resetting acked_features to 0 if needed, Qemu always
keeps the backup acked_features up-to-date and saves the
acked_features after virtio_net_set_features in advance,
including the reset of acked_features to 0, so that behavior
is also covered.

Signed-off-by: Hyman Huang(黄勇) 
Signed-off-by: Guoyi Tu 
---
  hw/net/vhost_net.c  | 9 +
  hw/net/virtio-net.c | 5 +
  include/net/vhost_net.h | 2 ++
  net/vhost-user.c| 6 +-
  4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index d28f8b9..2bffc27 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -141,6 +141,15 @@ uint64_t vhost_net_get_acked_features(VHostNetState *net)
  return net->dev.acked_features;
  }
  
+void vhost_net_save_acked_features(NetClientState *nc)

+{
+if (nc->info->type != NET_CLIENT_DRIVER_VHOST_USER) {
+return;
+}
+
+vhost_user_save_acked_features(nc, false);
+}
+
  static int vhost_net_get_fd(NetClientState *backend)
  {
  switch (backend->info->type) {
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index e9f696b..5f8f788 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -924,6 +924,11 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
  continue;
  }
  vhost_net_ack_features(get_vhost_net(nc->peer), features);
+/*
+ * keep acked_features in NetVhostUserState up-to-date so it
+ * can't miss any features configured by guest virtio driver.
+ */
+vhost_net_save_acked_features(nc->peer);
  }
  
  if (virtio_has_feature(features, VIRTIO_NET_F_CTRL_VLAN)) {


So when do you want to ack features but *not* save them?
When openvswitch restarts and reconnects, and Qemu starts the vhost_dev,
the acked_features in the vhost_dev need to be initialized, and the
initial value is fetched from the acked_features in NetVhostUserState.

At this time, that saved copy may not be up-to-date, but it is the value we want.


Is the effect of this patch, fundamentally, that guest features
from virtio are always copied to vhost-user?
Do we even need an extra copy in vhost user then?

I'm trying to explain this from my view; please point out the mistake
if I failed. :)


When the socket used by the vhost-user device is disconnected from
openvswitch, Qemu will stop the vhost-user device and clean up the whole
vhost_dev struct (including the vm's memory regions and acked_features).
Once the socket is reconnected from openvswitch, Qemu will collect the
vm's memory regions dynamically, but as for acked_features, IMHO, Qemu
cannot fetch it from the guest features of virtio-net, because
acked_features is somewhat different from the guest features (bit 30
differs, at least), so Qemu needs an extra copy.
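
A minimal sketch of the point above, assuming "bit 30" refers to
VHOST_USER_F_PROTOCOL_FEATURES (a backend feature negotiated on the
vhost-user link and never exposed to the guest):

#include <stdint.h>

#define VHOST_USER_F_PROTOCOL_FEATURES 30

/* acked_features cannot be re-derived from the guest features after a
 * reconnect: the two sets differ at least in bit 30, hence the extra
 * saved copy in NetVhostUserState.
 */
static uint64_t sketch_acked_features(uint64_t guest_features)
{
    uint64_t acked = guest_features;
    acked |= UINT64_C(1) << VHOST_USER_F_PROTOCOL_FEATURES;
    return acked;               /* != guest_features */
}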




all this came in with:

commit a463215b087c41d7ca94e51aa347cde523831873
Author: Marc-André Lureau 
Date:   Mon Jun 6 18:45:05 2016 +0200

 vhost-net: save & restore vhost-user acked features

Marc-André do you remember why we have a copy of features in vhost-user
and not just reuse the features from virtio?



diff --git a/include/net/vhost_net.h b/include/net/vhost_net.h
index 387e913..3a5579b 100644
--- a/include/net/vhost_net.h
+++ b/include/net/vhost_net.h
@@ -46,6 +46,8 @@ int vhost_set_vring_enable(NetClientState * nc, int enable);
  
  uint64_t vhost_net_get_acked_features(VHostNetState *net);
  
+void vhost_net_save_acked_features(NetClientState *nc);
+
  int vhost_net_set_mtu(struct vhost_net *net, uint16_t mtu);
  
  #endif

diff --git a/net/vhost-user.c b/net/vhost-user.c
index 74f349c..c512cc9 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -258,11 +258,7 @@ static void chr_closed_bh(void *opaque)
  s = DO_UPCAST(NetVhostUserState, nc, ncs[0]);
  
  for (i = queues -1; i >= 0; i--) {
-s = DO_UPCAST(NetVhostUserState, nc, ncs[i]);
-
-if (s->vhost_net) {
-s->acked_features = vhost_net_get_acked_features(s->vhost_net);
-}
+vhost_user_save_acked_features(ncs[i], false);
  }
  
  qmp_set_link(name, false, &err);

--
1.8.3.1






[libnbd PATCH v2 05/23] states: Prepare to receive 64-bit replies

2022-11-14 Thread Eric Blake
Support receiving headers for 64-bit replies if extended headers were
negotiated.  We already insist that the server not send us too much
payload in one reply, so we can exploit that and merge the 64-bit
length back into a normalized 32-bit field for the rest of the payload
length calculations.  The NBD protocol specifically documents that
extended mode takes precedence over structured replies, and that there
are no simple replies in extended mode.  We can also take advantage
that the handle field is in the same offset in all the various reply
types.

Note that if we negotiate extended headers, but a non-compliant server
replies with a non-extended header, this patch will stall waiting for
the server to send more bytes rather than noticing that the magic
number is wrong (for aio operations, you'll get a magic number
mismatch once you send a second command that elicits a reply; but for
blocking operations, we basically deadlock).  The easy alternative
would be to read just the first 4 bytes of magic, then determine how
many more bytes to expect; but that would require more states and
syscalls, and not worth it since the typical server will be compliant.
The other alternative is what the next patch implements: teaching
REPLY.RECV_REPLY to handle short reads that were at least long enough
to transmit magic to specifically look for magic number mismatch.

At this point, h->extended_headers is permanently false (we can't
enable it until all other aspects of the protocol have likewise been
converted).
---
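
As a reading aid, a sketch of the three wire layouts this refers to
(offsets in bytes, big-endian on the wire; the extended form and its
field order are the ones assumed by this series, not a settled
standard):

#include <stdint.h>

struct nbd_simple_reply {       /* 16 bytes */
    uint32_t magic;             /* offset  0 */
    uint32_t error;             /* offset  4 */
    uint64_t handle;            /* offset  8 */
};

struct nbd_structured_reply {   /* 20 bytes */
    uint32_t magic;             /* offset  0 */
    uint16_t flags;             /* offset  4 */
    uint16_t type;              /* offset  6 */
    uint64_t handle;            /* offset  8 */
    uint32_t length;            /* offset 16 */
};

struct nbd_extended_reply {     /* 32 bytes */
    uint32_t magic;             /* offset  0 */
    uint16_t flags;             /* offset  4 */
    uint16_t type;              /* offset  6 */
    uint64_t handle;            /* offset  8 */
    uint64_t offset;            /* offset 16 */
    uint64_t length;            /* offset 24 */
};

The handle sits at byte offset 8 in every variant, which is what lets
the code read a common prefix before branching on the magic number.
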
 lib/internal.h  |  1 +
 generator/states-reply-structured.c | 52 +++--
 generator/states-reply.c| 31 +++--
 3 files changed, 57 insertions(+), 27 deletions(-)

diff --git a/lib/internal.h b/lib/internal.h
index e900eca3..73fd24c0 100644
--- a/lib/internal.h
+++ b/lib/internal.h
@@ -242,6 +242,7 @@ struct nbd_handle {
   union {
 struct nbd_simple_reply simple;
 struct nbd_structured_reply structured;
+struct nbd_extended_reply extended;
   } hdr;
   union {
 struct nbd_structured_reply_offset_data offset_data;
diff --git a/generator/states-reply-structured.c b/generator/states-reply-structured.c
index 6f187f14..da9894c6 100644
--- a/generator/states-reply-structured.c
+++ b/generator/states-reply-structured.c
@@ -45,14 +45,20 @@ structured_reply_in_bounds (uint64_t offset, uint32_t length,

 STATE_MACHINE {
  REPLY.STRUCTURED_REPLY.START:
-  /* We've only read the simple_reply.  The structured_reply is longer,
-   * so read the remaining part.
+  /* If we have extended headers, we've already read the entire header.
+   * Otherwise, we've only read enough for a simple_reply; since structured
+   * replies are longer, read the remaining part.
*/
-  h->rbuf = &h->sbuf;
-  h->rbuf = (char *) h->rbuf + sizeof h->sbuf.reply.hdr.simple;
-  h->rlen = sizeof h->sbuf.reply.hdr.structured;
-  h->rlen -= sizeof h->sbuf.reply.hdr.simple;
-  SET_NEXT_STATE (%RECV_REMAINING);
+  if (h->extended_headers) {
+assert (h->rbuf == sizeof h->sbuf.reply.hdr.extended + (char*) &h->sbuf);
+SET_NEXT_STATE (%CHECK);
+  }
+  else {
+assert (h->rbuf == sizeof h->sbuf.reply.hdr.simple + (char*) &h->sbuf);
+h->rlen = sizeof h->sbuf.reply.hdr.structured -
+  sizeof h->sbuf.reply.hdr.simple;
+SET_NEXT_STATE (%RECV_REMAINING);
+  }
   return 0;

  REPLY.STRUCTURED_REPLY.RECV_REMAINING:
@@ -69,11 +75,18 @@  REPLY.STRUCTURED_REPLY.RECV_REMAINING:
  REPLY.STRUCTURED_REPLY.CHECK:
   struct command *cmd = h->reply_cmd;
   uint16_t flags, type;
-  uint32_t length;
+  uint64_t length;
+  uint64_t offset = -1;

+  assert (cmd);
   flags = be16toh (h->sbuf.reply.hdr.structured.flags);
   type = be16toh (h->sbuf.reply.hdr.structured.type);
-  length = be32toh (h->sbuf.reply.hdr.structured.length);
+  if (h->extended_headers) {
+length = be64toh (h->sbuf.reply.hdr.extended.length);
+offset = be64toh (h->sbuf.reply.hdr.extended.offset);
+  }
+  else
+length = be32toh (h->sbuf.reply.hdr.structured.length);

   /* Reject a server that replies with too much information, but don't
* reject a single structured reply to NBD_CMD_READ on the largest
@@ -83,13 +96,18 @@  REPLY.STRUCTURED_REPLY.CHECK:
* not worth keeping the connection alive.
*/
   if (length > MAX_REQUEST_SIZE + sizeof h->sbuf.reply.payload.offset_data) {
-set_error (0, "invalid server reply length %" PRIu32, length);
+set_error (0, "invalid server reply length %" PRIu64, length);
 SET_NEXT_STATE (%.DEAD);
 return 0;
   }
+  /* For convenience, we now normalize extended replies into compact,
+   * doable since we validated length fits in 32 bits.
+   */
+  h->sbuf.reply.hdr.structured.length = length;

   /* Skip an unexpected structured reply, including to an unknown cookie. */
-  if (cmd == NULL || !h->structured_replies)
+  if (cmd == NULL || !h->structured_replies ||
+  (h->extended_headers && offset != cmd->of

Re: [PATCH v7 12/12] So we use multifd to transmit zero pages.

2022-11-14 Thread Juan Quintela
Leonardo Brás  wrote:
> On Tue, 2022-08-02 at 08:39 +0200, Juan Quintela wrote:
>> Signed-off-by: Juan Quintela 

>> diff --git a/migration/multifd.c b/migration/multifd.c
>> index 89811619d8..54acdc004c 100644
>> --- a/migration/multifd.c
>> +++ b/migration/multifd.c
>> @@ -667,8 +667,8 @@ static void *multifd_send_thread(void *opaque)
>>  {
>>  MultiFDSendParams *p = opaque;
>>  Error *local_err = NULL;
>> -/* qemu older than 7.0 don't understand zero page on multifd channel */
>> -bool use_zero_page = migrate_use_multifd_zero_page();
>> +/* older qemu don't understand zero page on multifd channel */
>> +bool use_multifd_zero_page = !migrate_use_main_zero_page();
>
> I understand that "use_main_zero_page", which is introduced as a new 
> capability,
> is in fact the old behavior, and the new feature is introduced when this
> capability is disabled.
>
> But it sure looks weird reading:
>  use_multifd_zero_page = !migrate_use_main_zero_page();
>
> This series is fresh in my mind, but it took a few seconds to see that this is
> actually not a typo. 

We can't have it both ways.

All other capabilities are false by default.  And libvirt assumes they
are false.  So either we are willing to change the expectations, or we
need to do it this way.

In previous versions, I had the capability named the other way around,
and I changed it due to this.

Thanks, Juan.




Re: [PATCH v4] qapi/qom: Memory backend property prealloc-threads doc fix

2022-11-14 Thread Philippe Mathieu-Daudé

On 14/11/22 04:24, Zhenyu Zhang wrote:

Commit ffac16fab3 "hostmem: introduce "prealloc-threads" property"
(v5.0.0) changed the default number of threads from number of CPUs
to 1.  This was deemed a regression, and fixed in commit f8d426a685
"hostmem: default the amount of prealloc-threads to smp-cpus".
Except the documentation remained unchanged.

Update 'qapi/qom.json' to reflect the change in commit f8d426a6852c.

Signed-off-by: Zhenyu Zhang 
---

v4: Fix the line exceeding 80 characters limitation issue  (Gavin)
v3: Covers historical descriptions (Markus)
v2: The property is changed to smp-cpus since 5.0  (Phild)

---
  qapi/qom.json | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)


Reviewed-by: Philippe Mathieu-Daudé 





Re: [PATCH 10/13] block: Call drain callbacks only once

2022-11-14 Thread Hanna Reitz

On 08.11.22 13:37, Kevin Wolf wrote:

We only need to call both the BlockDriver's callback and the parent
callbacks when going from undrained to drained or vice versa. A second
drain section doesn't make a difference for the driver or the parent,
they weren't supposed to send new requests before and after the second
drain.

One thing that gets in the way is the 'ignore_bds_parents' parameter in
bdrv_do_drained_begin_quiesce() and bdrv_do_drained_end(): If it is true
for the first drain, bs->quiesce_counter will be non-zero, but the
parent callbacks still haven't been called, so a second drain where it
is false would still have to call them.

Instead of keeping track of this, let's just get rid of the parameter.
It was introduced in commit 6cd5c9d7b2d as an optimisation so that
during bdrv_drain_all(), we wouldn't recursively drain all parents up to
the root for each node, resulting in quadratic complexity. As it happens,
calling the callbacks only once solves the same problem, so as of this
patch, we'll still have O(n) complexity and ignore_bds_parents is not
needed any more.

This patch only ignores the 'ignore_bds_parents' parameter. It will be
removed in a separate patch.

Signed-off-by: Kevin Wolf 
---
  block.c  | 13 ++---
  block/io.c   | 24 +---
  tests/unit/test-bdrv-drain.c | 16 ++--
  3 files changed, 29 insertions(+), 24 deletions(-)


I too would like parent_quiesce_counter to become `bool parent_quiesced`, but:


Reviewed-by: Hanna Reitz 




Re: [PATCH v3 1/7] memory: associate DMA accesses with the initiator Device

2022-11-14 Thread Stefan Hajnoczi
The _guarded() calls are required in BHs, timers, fd read/write
callbacks, etc because we're no longer in the memory region dispatch
code with the reentrancy guard set. It's not clear to me whether the
_guarded() calls are actually required in most of these patches
though? Do you plan to convert every DMA API call to a _guarded() call
in the future?

I'm asking because coming up with an API that doesn't require these
code changes will reduce code churn and make existing code safe.

Does it make sense to separate the DMA API and the reentrancy guard
API? That way the reentrancy guard can be put in place once in any BH,
timer, etc callback and then the existing DMA APIs are used within
those callbacks without new _guarded() APIs.

This approach also reduces the number of times that the guard is
toggled. The current approach is fine-grained (per DMA API call) so
the guard needs to be toggled all the time, e.g. in DMA sglist loops.

If we want the compiler to prevent DMA API calls without a reentrancy
guard, then AddressSpace pointers can be hidden behind an API that
sets the guard. This ensures that you cannot access an address space
unless you have a reentrancy guard.
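
A hypothetical sketch of the shape being suggested -- none of these
names exist in QEMU today, they only illustrate toggling the guard once
per callback instead of per DMA call:

#include <stdbool.h>

typedef struct Device {
    bool mem_reentrancy_guard;      /* assumed per-device flag */
} Device;

static void guard_enter(Device *d) { d->mem_reentrancy_guard = true; }
static void guard_leave(Device *d) { d->mem_reentrancy_guard = false; }

/* Stand-in for an unmodified DMA API call; the MMIO dispatch path
 * would consult d->mem_reentrancy_guard to reject reentrant accesses.
 */
static void dma_read(Device *d) { (void)d; }

static void my_device_bh(void *opaque)
{
    Device *d = opaque;

    guard_enter(d);     /* one toggle at the edge of the callback... */
    dma_read(d);        /* ...existing DMA calls run unchanged inside */
    dma_read(d);
    guard_leave(d);
}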

Stefan



Re: [PATCH v3 2/2] vhost-net: Fix the virtio features negotiation flaw

2022-11-14 Thread Michael S. Tsirkin
On Mon, Nov 14, 2022 at 11:35:30PM +0800, Hyman wrote:
> 
> 
> On 2022/11/11 3:17, Michael S. Tsirkin wrote:
> > On Sun, Oct 30, 2022 at 09:52:39PM +0800, huang...@chinatelecom.cn wrote:
> > > From: Hyman Huang(黄勇) 
> > > 
> > > Save the acked_features once they are configured by the guest
> > > virtio driver, so that no features can be missed.
> > > 
> > > Note that this patch also changes the feature-saving logic in
> > > chr_closed_bh, which originally backed up the features no matter
> > > whether they were 0 or not, but now does so only if the features
> > > aren't 0.
> > > 
> > > As for resetting acked_features to 0 when needed: Qemu always
> > > keeps the backup acked_features up-to-date and saves the
> > > acked_features right after virtio_net_set_features, including
> > > when acked_features is reset to 0, so that behavior is also
> > > covered.
> > > 
> > > Signed-off-by: Hyman Huang(黄勇) 
> > > Signed-off-by: Guoyi Tu 
> > > ---
> > >   hw/net/vhost_net.c  | 9 +
> > >   hw/net/virtio-net.c | 5 +
> > >   include/net/vhost_net.h | 2 ++
> > >   net/vhost-user.c| 6 +-
> > >   4 files changed, 17 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
> > > index d28f8b9..2bffc27 100644
> > > --- a/hw/net/vhost_net.c
> > > +++ b/hw/net/vhost_net.c
> > > @@ -141,6 +141,15 @@ uint64_t vhost_net_get_acked_features(VHostNetState *net)
> > >   return net->dev.acked_features;
> > >   }
> > > +void vhost_net_save_acked_features(NetClientState *nc)
> > > +{
> > > +if (nc->info->type != NET_CLIENT_DRIVER_VHOST_USER) {
> > > +return;
> > > +}
> > > +
> > > +vhost_user_save_acked_features(nc, false);
> > > +}
> > > +
> > >   static int vhost_net_get_fd(NetClientState *backend)
> > >   {
> > >   switch (backend->info->type) {
> > > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> > > index e9f696b..5f8f788 100644
> > > --- a/hw/net/virtio-net.c
> > > +++ b/hw/net/virtio-net.c
> > > @@ -924,6 +924,11 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
> > >   continue;
> > >   }
> > >   vhost_net_ack_features(get_vhost_net(nc->peer), features);
> > > +/*
> > > + * keep acked_features in NetVhostUserState up-to-date so it
> > > + * can't miss any features configured by guest virtio driver.
> > > + */
> > > +vhost_net_save_acked_features(nc->peer);
> > >   }
> > >   if (virtio_has_feature(features, VIRTIO_NET_F_CTRL_VLAN)) {
> > 
> > So when do you want to ack features but *not* save them?
> When openvswitch restarts and reconnects and Qemu starts the vhost_dev,
> the acked_features in Qemu's vhost_dev need to be initialized, and the
> initial value is fetched from acked_features in NetVhostUserState.
> At this time, acked_features may not be up-to-date, but it is what we want.
> > 
> > Is the effect of this patch, fundamentally, that guest features
> > from virtio are always copied to vhost-user?
> > Do we even need an extra copy in vhost user then?
> > 
> I'm trying to explain this from my view; please point out the mistake if I
> failed. :)
> 
> When the socket used by the vhost-user device is disconnected from
> openvswitch, Qemu will stop the vhost-user device and clean up the whole
> vhost_dev struct (including the vm's memory regions and acked_features).
> Once the socket is reconnected from openvswitch, Qemu will collect the vm's
> memory regions dynamically, but as for acked_features, IMHO, Qemu cannot
> fetch it from the guest features of virtio-net, because acked_features is
> somewhat different from the guest features (bit 30 differs, at least), so
> Qemu needs an extra copy.

Well we already have a list of valid frontend features in
user_feature_bits.



> > 
> > all this came in with:
> > 
> > commit a463215b087c41d7ca94e51aa347cde523831873
> > Author: Marc-André Lureau 
> > Date:   Mon Jun 6 18:45:05 2016 +0200
> > 
> >  vhost-net: save & restore vhost-user acked features
> > 
> > Marc-André do you remember why we have a copy of features in vhost-user
> > and not just reuse the features from virtio?
> > 
> > 
> > > diff --git a/include/net/vhost_net.h b/include/net/vhost_net.h
> > > index 387e913..3a5579b 100644
> > > --- a/include/net/vhost_net.h
> > > +++ b/include/net/vhost_net.h
> > > @@ -46,6 +46,8 @@ int vhost_set_vring_enable(NetClientState * nc, int enable);
> > >   uint64_t vhost_net_get_acked_features(VHostNetState *net);
> > > +void vhost_net_save_acked_features(NetClientState *nc);
> > > +
> > >   int vhost_net_set_mtu(struct vhost_net *net, uint16_t mtu);
> > >   #endif
> > > diff --git a/net/vhost-user.c b/net/vhost-user.c
> > > index 74f349c..c512cc9 100644
> > > --- a/net/vhost-user.c
> > > +++ b/net/vhost-user.c
> > > @@ -258,11 +258,7 @@ static void chr_closed_bh(void *opaque)
> > >   s = DO_UPCAST(NetVhostUserState, nc, ncs[0]);
> > >   for (i = queues -1; i >= 0; i--) {
> > > -s = DO_UPCAST(NetVhostUserState, nc, ncs[i

Re: [PATCH v2 03/15] migration: Cleanup xbzrle zero page cache update logic

2022-11-14 Thread Juan Quintela
Peter Xu  wrote:
> The major change is to replace "!save_page_use_compression()" with
> "xbzrle_enabled" to make it clear.
>
> Reasonings:
>
> (1) When compression enabled, "!save_page_use_compression()" is exactly the
> same as checking "xbzrle_enabled".
>
> (2) When compression disabled, "!save_page_use_compression()" always return
> true.  We used to try calling the xbzrle code, but after this change we
> won't, and we shouldn't need to.
>
> While at it, drop the xbzrle_enabled check in xbzrle_cache_zero_page(),
> because with this change it's not needed anymore.
>
> Reviewed-by: Dr. David Alan Gilbert 
> Signed-off-by: Peter Xu 

Reviewed-by: Juan Quintela 




Re: [PATCH 09/13] block: Remove subtree drains

2022-11-14 Thread Hanna Reitz

On 08.11.22 13:37, Kevin Wolf wrote:

Subtree drains are not used any more. Remove them.

After this, BdrvChildClass.attach/detach() don't poll any more.

Signed-off-by: Kevin Wolf 
---
  include/block/block-io.h |  18 +--
  include/block/block_int-common.h |   1 -
  include/block/block_int-io.h |  12 --
  block.c  |  20 +--
  block/io.c   | 121 +++---
  tests/unit/test-bdrv-drain.c | 261 ++-
  6 files changed, 44 insertions(+), 389 deletions(-)


Reviewed-by: Hanna Reitz 




Re: [PATCH v2 4/5] migration: Use non-atomic ops for clear log bitmap

2022-11-14 Thread Juan Quintela
Peter Xu  wrote:
> Since we already have bitmap_mutex to protect either the dirty bitmap or
> the clear log bitmap, we don't need atomic operations to set/clear/test on
> the clear log bitmap.  Switching all ops from atomic to non-atomic
> versions, meanwhile touch up the comments to show which lock is in charge.
>
> Introduced non-atomic version of bitmap_test_and_clear_atomic(), mostly the
> same as the atomic version but simplified a few places, e.g. dropped the
> "old_bits" variable, and also the explicit memory barriers.
>
> Reviewed-by: Dr. David Alan Gilbert 
> Signed-off-by: Peter Xu 
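
A simplified per-bit sketch of the non-atomic helper described above
(the actual patch works word-at-a-time; this only shows the semantics,
which are safe because bitmap_mutex is held by the caller):

#include <stdbool.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Clear all bits in [start, start + nr); return whether any was set. */
static bool bitmap_test_and_clear(unsigned long *map, long start, long nr)
{
    bool dirty = false;
    long i;

    for (i = start; i < start + nr; i++) {
        unsigned long *word = &map[i / BITS_PER_LONG];
        unsigned long mask = 1UL << (i % BITS_PER_LONG);

        if (*word & mask) {
            dirty = true;
            *word &= ~mask;     /* plain store, no barriers needed */
        }
    }
    return dirty;
}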

Reviewed-by: Juan Quintela 




Re: [PATCH v4] qapi/qom: Memory backend property prealloc-threads doc fix

2022-11-14 Thread Gavin Shan

On 11/14/22 11:24 AM, Zhenyu Zhang wrote:

Commit ffac16fab3 "hostmem: introduce "prealloc-threads" property"
(v5.0.0) changed the default number of threads from number of CPUs
to 1.  This was deemed a regression, and fixed in commit f8d426a685
"hostmem: default the amount of prealloc-threads to smp-cpus".
Except the documentation remained unchanged.

Update 'qapi/qom.json' to reflect the change in commit f8d426a6852c.

Signed-off-by: Zhenyu Zhang 
---

v4: Fix the line exceeding 80 characters limitation issue  (Gavin)
v3: Covers historical descriptions (Markus)
v2: The property is changed to smp-cpus since 5.0  (Phild)

---
  qapi/qom.json | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)



Reviewed-by: Gavin Shan 


diff --git a/qapi/qom.json b/qapi/qom.json
index 30e76653ad..f4a7917f3d 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -576,7 +576,8 @@
  #
  # @prealloc: if true, preallocate memory (default: false)
  #
-# @prealloc-threads: number of CPU threads to use for prealloc (default: 1)
+# @prealloc-threads: number of CPU threads to use for prealloc
+#(default: number of CPUs) (since 5.0)
  #
  # @prealloc-context: thread context to use for creation of preallocation 
threads
  #(default: none) (since 7.2)






[PATCH v2 09/15] nbd/client: Initial support for extended headers

2022-11-14 Thread Eric Blake
Update the client code to be able to send an extended request, and
parse an extended header from the server.  Note that since we reject
any structured reply with a too-large payload, we can always normalize
a valid header back into the compact form, so that the caller need not
deal with two branches of a union.  Still, until a later patch lets
the client negotiate extended headers, the code added here should not
be reached.  Note that because of the different magic numbers, it is
just as easy to trace and then tolerate a non-compliant server sending
the wrong header reply as it would be to insist that the server is
compliant.

The only caller to nbd_receive_reply() always passed NULL for errp;
since we are changing the signature anyways, I decided to sink the
decision to ignore errors one layer lower.

Signed-off-by: Eric Blake 
---
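
For reference, the two request layouts serialized by nbd_send_request()
below (wire offsets in bytes, big-endian; only the magic and the width
of the final length field differ):

#include <stdint.h>

struct nbd_compact_request {    /* NBD_REQUEST_MAGIC, 28 bytes */
    uint32_t magic;             /* offset  0 */
    uint16_t flags;             /* offset  4 */
    uint16_t type;              /* offset  6 */
    uint64_t handle;            /* offset  8 */
    uint64_t from;              /* offset 16 */
    uint32_t len;               /* offset 24, 32-bit */
};

struct nbd_extended_request {   /* NBD_EXTENDED_REQUEST_MAGIC, 32 bytes */
    uint32_t magic;             /* offset  0 */
    uint16_t flags;             /* offset  4 */
    uint16_t type;              /* offset  6 */
    uint64_t handle;            /* offset  8 */
    uint64_t from;              /* offset 16 */
    uint64_t len;               /* offset 24, widened to 64-bit */
};

(The structs are for illustration only; the code below writes the
fields directly into a byte buffer, so C struct padding never comes
into play.)
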
 include/block/nbd.h |  2 +-
 block/nbd.c |  3 +-
 nbd/client.c| 84 ++---
 nbd/trace-events|  1 +
 4 files changed, 61 insertions(+), 29 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index 357121ce76..02e31b2261 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -363,7 +363,7 @@ int nbd_init(int fd, QIOChannelSocket *sioc, NBDExportInfo *info,
  Error **errp);
 int nbd_send_request(QIOChannel *ioc, NBDRequest *request, bool ext_hdr);
 int coroutine_fn nbd_receive_reply(BlockDriverState *bs, QIOChannel *ioc,
-   NBDReply *reply, Error **errp);
+   NBDReply *reply, bool ext_hdrs);
 int nbd_client(int fd);
 int nbd_disconnect(int fd);
 int nbd_errno_to_system_errno(int err);
diff --git a/block/nbd.c b/block/nbd.c
index 32681d2867..a8b1bc1054 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -457,7 +457,8 @@ static coroutine_fn int nbd_receive_replies(BDRVNBDState *s, uint64_t handle)

 /* We are under mutex and handle is 0. We have to do the dirty work. */
 assert(s->reply.handle == 0);
-ret = nbd_receive_reply(s->bs, s->ioc, &s->reply, NULL);
+ret = nbd_receive_reply(s->bs, s->ioc, &s->reply,
+s->info.extended_headers);
 if (ret <= 0) {
 ret = ret ? ret : -EIO;
 nbd_channel_error(s, ret);
diff --git a/nbd/client.c b/nbd/client.c
index 2480a48ec6..70f06ce637 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -1348,22 +1348,28 @@ int nbd_disconnect(int fd)

 int nbd_send_request(QIOChannel *ioc, NBDRequest *request, bool ext_hdr)
 {
-uint8_t buf[NBD_REQUEST_SIZE];
+uint8_t buf[NBD_EXTENDED_REQUEST_SIZE];
+size_t len;

-assert(!ext_hdr);
-assert(request->len <= UINT32_MAX);
 trace_nbd_send_request(request->from, request->len, request->handle,
request->flags, request->type,
nbd_cmd_lookup(request->type));

-stl_be_p(buf, NBD_REQUEST_MAGIC);
+stl_be_p(buf, ext_hdr ? NBD_EXTENDED_REQUEST_MAGIC : NBD_REQUEST_MAGIC);
 stw_be_p(buf + 4, request->flags);
 stw_be_p(buf + 6, request->type);
 stq_be_p(buf + 8, request->handle);
 stq_be_p(buf + 16, request->from);
-stl_be_p(buf + 24, request->len);
+if (ext_hdr) {
+stq_be_p(buf + 24, request->len);
+len = NBD_EXTENDED_REQUEST_SIZE;
+} else {
+assert(request->len <= UINT32_MAX);
+stl_be_p(buf + 24, request->len);
+len = NBD_REQUEST_SIZE;
+}

-return nbd_write(ioc, buf, sizeof(buf), NULL);
+return nbd_write(ioc, buf, len, NULL);
 }

 /* nbd_receive_simple_reply
@@ -1392,28 +1398,34 @@ static int nbd_receive_simple_reply(QIOChannel *ioc, NBDSimpleReply *reply,

 /* nbd_receive_structured_reply_chunk
  * Read structured reply chunk except magic field (which should be already
- * read).
+ * read).  Normalize into the compact form.
  * Payload is not read.
  */
-static int nbd_receive_structured_reply_chunk(QIOChannel *ioc,
-  NBDStructuredReplyChunk *chunk,
+static int nbd_receive_structured_reply_chunk(QIOChannel *ioc, NBDReply *chunk,
   Error **errp)
 {
 int ret;
+size_t len;
+uint64_t payload_len;

-assert(chunk->magic == NBD_STRUCTURED_REPLY_MAGIC);
+if (chunk->magic == NBD_STRUCTURED_REPLY_MAGIC) {
+len = sizeof(chunk->structured);
+} else {
+assert(chunk->magic == NBD_EXTENDED_REPLY_MAGIC);
+len = sizeof(chunk->extended);
+}

 ret = nbd_read(ioc, (uint8_t *)chunk + sizeof(chunk->magic),
-   sizeof(*chunk) - sizeof(chunk->magic), "structured chunk",
+   len - sizeof(chunk->magic), "structured chunk",
errp);
 if (ret < 0) {
 return ret;
 }

-chunk->flags = be16_to_cpu(chunk->flags);
-chunk->type = be16_to_cpu(chunk->type);
-chunk->handle = be64_to_cpu(chunk->handle);
-chunk->length = be32_to_cpu(ch

Re: [PATCH v4 4/7] target/arm: Enable TTBCR_EAE for ARMv8-R AArch32

2022-11-14 Thread Peter Maydell
On Sun, 23 Oct 2022 at 16:37,  wrote:
>
> From: Tobias Röhmel 
>
> ARMv8-R AArch32 CPUs behave as if TTBCR.EAE is always 1 even
> though they don't have the TTBCR register.
> See the ARM Architecture Reference Manual Supplement - ARMv8, for the
> ARMv8-R AArch32 architecture profile, version A.c, section C1.2.
>
> Signed-off-by: Tobias Röhmel 
> ---
>  target/arm/debug_helper.c | 3 +++
>  target/arm/internals.h| 4 
>  target/arm/tlb_helper.c   | 3 +++
>  3 files changed, 10 insertions(+)
>
> diff --git a/target/arm/debug_helper.c b/target/arm/debug_helper.c
> index c21739242c..73665f988b 100644
> --- a/target/arm/debug_helper.c
> +++ b/target/arm/debug_helper.c
> @@ -437,6 +437,9 @@ static uint32_t arm_debug_exception_fsr(CPUARMState *env)
>
>  if (target_el == 2 || arm_el_is_aa64(env, target_el)) {
>  using_lpae = true;
> +} else if (arm_feature(env, ARM_FEATURE_PMSA)
> +&& arm_feature(env, ARM_FEATURE_V8)) {

Indentation looks wrong here. Generally the second line of a
multiline if (...) condition starts in the column after the '(',
so it lines up with the first part of the condition.

> +using_lpae = true;
>  } else {
>  if (arm_feature(env, ARM_FEATURE_LPAE) &&
>  (env->cp15.tcr_el[target_el] & TTBCR_EAE)) {

For instance this is an example in the existing code.

We are inconsistent about whether we put operators like '&&' at
the end of the first line or beginning of the second line, so
pick whichever you like best, I guess.

> diff --git a/target/arm/internals.h b/target/arm/internals.h
> index 307a596505..e3699421b0 100644
> --- a/target/arm/internals.h
> +++ b/target/arm/internals.h
> @@ -253,6 +253,10 @@ unsigned int arm_pamax(ARMCPU *cpu);
>  static inline bool extended_addresses_enabled(CPUARMState *env)
>  {
>  uint64_t tcr = env->cp15.tcr_el[arm_is_secure(env) ? 3 : 1];
> +if (arm_feature(env, ARM_FEATURE_PMSA)
> + && arm_feature(env, ARM_FEATURE_V8)) {

Indentation is a bit odd here too.

> +return true;
> +}
>  return arm_el_is_aa64(env, 1) ||
> (arm_feature(env, ARM_FEATURE_LPAE) && (tcr & TTBCR_EAE));
>  }
> diff --git a/target/arm/tlb_helper.c b/target/arm/tlb_helper.c
> index ad225b1cb2..a2047b0bc6 100644
> --- a/target/arm/tlb_helper.c
> +++ b/target/arm/tlb_helper.c
> @@ -18,6 +18,9 @@ bool regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx)
>  int el = regime_el(env, mmu_idx);
>  if (el == 2 || arm_el_is_aa64(env, el)) {
>  return true;
> +} else if (arm_feature(env, ARM_FEATURE_PMSA)
> +&& arm_feature(env, ARM_FEATURE_V8)) {
> +return true;
>  }

Use an "if ()..." not an "else if" here to match the style
of the other check in this function.

>  if (arm_feature(env, ARM_FEATURE_LPAE)
>  && (regime_tcr(env, mmu_idx) & TTBCR_EAE)) {

Otherwise
Reviewed-by: Peter Maydell 

thanks
-- PMM



Re: [PATCH v3 01/15] hw: encode accessing CPU index in MemTxAttrs

2022-11-14 Thread Peter Maydell
On Fri, 11 Nov 2022 at 13:23, Philippe Mathieu-Daudé  wrote:
>
> On 31/10/22 14:03, Peter Maydell wrote:
> > On Mon, 31 Oct 2022 at 12:08, Philippe Mathieu-Daudé  
> > wrote:
> >>
> >> On 4/10/22 16:54, Peter Maydell wrote:
> >>> On Tue, 4 Oct 2022 at 14:33, Alex Bennée  wrote:
> 
> 
>  Peter Maydell  writes:
> > The MSC is in the address map like most other stuff, and thus there is
> > no restriction on whether it can be accessed by other things than CPUs
> > (DMAing to it would be silly but is perfectly possible).
> >
> > The intent of the code is "pass this transaction through, but force
> > it to be Secure/NonSecure regardless of what it was before". That
> > should not involve a change of the requester type.
> 
>  Should we assert (or warn) when the requester_type is unspecified?
> >>>
> >>> Not in the design of MemTxAttrs that's currently in git, no:
> >>> in that design it's perfectly fine for something generating
> >>> memory transactions to use MEMTXATTRS_UNSPECIFIED (which defaults
> >>> to meaning a bunch of things including "not secure").
> >>
> >> In tz_mpc_handle_block():
> >>
> >> static MemTxResult tz_mpc_handle_block(TZMPC *s, hwaddr addr, MemTxAttrs
> >> attrs)
> >> {
> >>   /* Handle a blocked transaction: raise IRQ, capture info, etc */
> >>   if (!s->int_stat) {
> >>
> >>   s->int_info1 = addr;
> >>   s->int_info2 = 0;
> >>   s->int_info2 = FIELD_DP32(s->int_info2, INT_INFO2, HMASTER,
> >> attrs.requester_id & 0x);
> >>   s->int_info2 = FIELD_DP32(s->int_info2, INT_INFO2, HNONSEC,
> >> ~attrs.secure);
> >>   s->int_info2 = FIELD_DP32(s->int_info2, INT_INFO2, CFG_NS,
> >> tz_mpc_cfg_ns(s, addr));
> >>
> >>
> >> Should we check whether the requester is MTRT_CPU?
> >
> > That code is basically assuming that the requester_id is the AMBA AHB
> > 'HMASTER' field (i.e. something hopefully unique to all things that
> > send out transactions, not necessarily limited only to CPUs), which is a
> > somewhat bogus assumption given that it isn't currently any such thing...
> >
> > I'm not sure if/how this patchset plans to model generic "ID of transaction
> > generator".
>
> Does your 'generic "ID of transaction generator"' fit into MTRT_MACHINE
> described as "for more complex encoding":
>
>'MACHINE indicates a machine specific encoding which needs further
> processing to decode into its constituent parts.'
>
> ?

Not really, because "generic ID of a transaction generator" is a
superset of "ID per CPU", not something separate that sits
alongside it...

-- PMM



Re: [PATCH] hw/intc: sifive_plic: Renumber the S irqs for numa support

2022-11-14 Thread Philippe Mathieu-Daudé

On 14/11/22 11:34, Frédéric Pétrot wrote:

Le 14/11/2022 à 09:40, Philippe Mathieu-Daudé a écrit :

On 11/11/22 13:19, Frédéric Pétrot wrote:




Eventually we could unify the style:

-- >8 --
@@ -476,11 +476,11 @@ DeviceState *sifive_plic_create(hwaddr addr, char *hart_config,

  CPUState *cpu = qemu_get_cpu(cpu_num);

  if (plic->addr_config[i].mode == PLICMode_M) {
-    qdev_connect_gpio_out(dev, num_harts - plic->hartid_base + cpu_num,
+    qdev_connect_gpio_out(dev, cpu_num - hartid_base + num_harts,
     qdev_get_gpio_in(DEVICE(cpu), IRQ_M_EXT));
  }
  if (plic->addr_config[i].mode == PLICMode_S) {
-    qdev_connect_gpio_out(dev, cpu_num,
+    qdev_connect_gpio_out(dev, cpu_num - hartid_base,
     qdev_get_gpio_in(DEVICE(cpu), IRQ_S_EXT));
  }
---


   IIUC hartid_base is used to set plic->hartid_base, so agreed, along
   with the style unification.
   I'll send a v2, then.
   Since Alistair already queued the patch, how shall I proceed?


I didn't notice Alistair queued it (he usually sends a notification by
responding "queued" to the patches). If it is queued, then it's too late
(and not a big deal) -- you can still post the v2 and let him pick
it up :)



Re: [PATCH] hw/sd: Fix sun4i allwinner-sdhost for U-Boot

2022-11-14 Thread Peter Maydell
On Sat, 12 Nov 2022 at 21:49, Strahinja Jankovic
 wrote:
>
> Trying to run U-Boot for Cubieboard (Allwinner A10) fails because it cannot
> access SD card. The problem is that FIFO register in current
> allwinner-sdhost implementation is at the address corresponding to
> Allwinner H3, but not A10.
> Linux kernel is not affected since Linux driver uses DMA access and does
> not use FIFO register for reading/writing.
>
> This patch adds new class parameter `is_sun4i` and based on that
> parameter uses register at offset 0x100 either as FIFO register (if
> sun4i) or as threshold register (if not sun4i; in this case register at
> 0x200 is FIFO register).
>
> Tested with U-Boot and Linux kernel image built for Cubieboard and
> OrangePi PC.
>
> Signed-off-by: Strahinja Jankovic 
> ---
>  hw/sd/allwinner-sdhost.c | 67 ++--
>  include/hw/sd/allwinner-sdhost.h |  1 +
>  2 files changed, 47 insertions(+), 21 deletions(-)
>
> diff --git a/hw/sd/allwinner-sdhost.c b/hw/sd/allwinner-sdhost.c
> index 455d6eabf6..51e5e90830 100644
> --- a/hw/sd/allwinner-sdhost.c
> +++ b/hw/sd/allwinner-sdhost.c
> @@ -65,7 +65,7 @@ enum {
>  REG_SD_DLBA   = 0x84,  /* Descriptor List Base Address */
>  REG_SD_IDST   = 0x88,  /* Internal DMA Controller Status */
>  REG_SD_IDIE   = 0x8C,  /* Internal DMA Controller IRQ Enable */
> -REG_SD_THLDC  = 0x100, /* Card Threshold Control */
> +REG_SD_THLDC  = 0x100, /* Card Threshold Control / FIFO (sun4i only)*/
>  REG_SD_DSBD   = 0x10C, /* eMMC DDR Start Bit Detection Control */
>  REG_SD_RES_CRC= 0x110, /* Response CRC from card/eMMC */
>  REG_SD_DATA7_CRC  = 0x114, /* CRC Data 7 from card/eMMC */
> @@ -415,10 +415,29 @@ static void allwinner_sdhost_dma(AwSdHostState *s)
>  }
>  }
>
> +static uint32_t allwinner_sdhost_fifo_read(AwSdHostState *s)
> +{
> +uint32_t res = 0;
> +
> +if (sdbus_data_ready(&s->sdbus)) {
> +sdbus_read_data(&s->sdbus, &res, sizeof(uint32_t));
> +le32_to_cpus(&res);
> +allwinner_sdhost_update_transfer_cnt(s, sizeof(uint32_t));
> +allwinner_sdhost_auto_stop(s);
> +allwinner_sdhost_update_irq(s);
> +} else {
> +qemu_log_mask(LOG_GUEST_ERROR, "%s: no data ready on SD bus\n",
> +  __func__);
> +}
> +
> +return res;
> +}
> +
>  static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset,
>unsigned size)
>  {
>  AwSdHostState *s = AW_SDHOST(opaque);
> +AwSdHostClass *sc = AW_SDHOST_GET_CLASS(s);
>  uint32_t res = 0;
>
>  switch (offset) {
> @@ -508,8 +527,12 @@ static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset,
>  case REG_SD_IDIE:  /* Internal DMA Controller Interrupt Enable */
>  res = s->dmac_irq;
>  break;
> -case REG_SD_THLDC: /* Card Threshold Control */
> -res = s->card_threshold;
> +case REG_SD_THLDC: /* Card Threshold Control or FIFO register (sun4i) */
> +if (sc->is_sun4i) {
> +res = allwinner_sdhost_fifo_read(s);
> +} else {
> +res = s->card_threshold;
> +}
>  break;
>  case REG_SD_DSBD:  /* eMMC DDR Start Bit Detection Control */
>  res = s->startbit_detect;
> @@ -531,16 +554,7 @@ static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset,
>  res = s->status_crc;
>  break;
>  case REG_SD_FIFO:  /* Read/Write FIFO */
> -if (sdbus_data_ready(&s->sdbus)) {
> -sdbus_read_data(&s->sdbus, &res, sizeof(uint32_t));
> -le32_to_cpus(&res);
> -allwinner_sdhost_update_transfer_cnt(s, sizeof(uint32_t));
> -allwinner_sdhost_auto_stop(s);
> -allwinner_sdhost_update_irq(s);
> -} else {
> -qemu_log_mask(LOG_GUEST_ERROR, "%s: no data ready on SD bus\n",
> -  __func__);
> -}
> +res = allwinner_sdhost_fifo_read(s);

Does the sun4i really have the FIFO at both addresses, or should
this one do something else for sun4i ?

thanks
-- PMM



[PULL 2/2] target/i386: hardcode R_EAX as destination register for LAHF/SAHF

2022-11-14 Thread Richard Henderson
From: Paolo Bonzini 

When translating code that is using LAHF and SAHF in combination with the
REX prefix, the instructions should not use any other register than AH;
however, QEMU selects SPL (SP being register 4, just like AH) if the
REX prefix is present.  To fix this, use deposit directly without
going through gen_op_mov_v_reg and gen_op_mov_reg_v.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/130
Signed-off-by: Paolo Bonzini 
Signed-off-by: Richard Henderson 
---
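
The fix is easier to see as plain bit arithmetic; a sketch of what the
two TCG ops below compute (illustration only):

#include <stdint.h>

/* SAHF: fetch AH, i.e. bits 15:8 of RAX -- tcg_gen_shri_tl(T0, RAX, 8)
 * (the subsequent andi masks it down to the flag bits).
 */
static uint64_t sahf_fetch_ah(uint64_t rax)
{
    return (rax >> 8) & 0xff;
}

/* LAHF: store the flags into AH without touching the rest of RAX --
 * tcg_gen_deposit_tl(RAX, RAX, T0, 8, 8).  Register 4 with MO_8 under
 * a REX prefix would have selected SPL instead, which is the bug being
 * fixed.
 */
static uint64_t lahf_store_ah(uint64_t rax, uint64_t flags)
{
    return (rax & ~UINT64_C(0xff00)) | ((flags & 0xff) << 8);
}
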
 target/i386/tcg/translate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index dbd6492778..7e0b2a709a 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -5230,7 +5230,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
 case 0x9e: /* sahf */
 if (CODE64(s) && !(s->cpuid_ext3_features & CPUID_EXT3_LAHF_LM))
 goto illegal_op;
-gen_op_mov_v_reg(s, MO_8, s->T0, R_AH);
+tcg_gen_shri_tl(s->T0, cpu_regs[R_EAX], 8);
 gen_compute_eflags(s);
 tcg_gen_andi_tl(cpu_cc_src, cpu_cc_src, CC_O);
 tcg_gen_andi_tl(s->T0, s->T0, CC_S | CC_Z | CC_A | CC_P | CC_C);
@@ -5242,7 +5242,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
 gen_compute_eflags(s);
 /* Note: gen_compute_eflags() only gives the condition codes */
 tcg_gen_ori_tl(s->T0, cpu_cc_src, 0x02);
-gen_op_mov_reg_v(s, MO_8, R_AH, s->T0);
+tcg_gen_deposit_tl(cpu_regs[R_EAX], cpu_regs[R_EAX], s->T0, 8, 8);
 break;
 case 0xf5: /* cmc */
 gen_compute_eflags(s);
-- 
2.34.1




Re: KVM call for 2022-11-15

2022-11-14 Thread Alex Williamson
On Mon, 14 Nov 2022 12:47:40 +0100
quint...@redhat.com wrote:

> Hi
> 
> Please, send any topic that you are interested in covering.
> 
> We already have some topics:
> Re agenda, see below topics our team would like to discuss:
> 
>- QEMU support for kernel/vfio V2 live migration patches
>- acceptance of changes required for Grace/Hopper passthrough and vGPU
>support
>   - the migration support is now looking like it will converge on the
>   6.2 kernel
>- tuning GPU migration performance on QEMU/vfio, beyond what the V2 work
>delivers
> 
> 
>  Call details:
> 
> By popular demand, a google calendar public entry with it
> 
>   
> https://www.google.com/calendar/embed?src=dG9iMXRqcXAzN3Y4ZXZwNzRoMHE4a3BqcXNAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ
> 
> (Let me know if you have any problems with the calendar entry.  I just
> gave up about getting right at the same time CEST, CET, EDT and DST).

This url doesn't work for me, but the one embedded in your previous
call for agenda[1] does.  Thanks,

Alex

[1]https://lore.kernel.org/all/87tu3nvrzo.fsf@secure.mitica/




Re: [PULL 0/1] Fix loongarch make check-tcg failure

2022-11-14 Thread Stefan Hajnoczi
Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/7.2 for any 
user-visible changes.




[PATCH v2 1/6] spec: Recommend cap on NBD_REPLY_TYPE_BLOCK_STATUS length

2022-11-14 Thread Eric Blake
The spec was silent on how many extents a server could reply with.
However, both qemu and nbdkit (the two server implementations known to
have implemented the NBD_CMD_BLOCK_STATUS extension) implement a hard
cap, and will truncate the amount of extents in a reply to avoid
sending a client a reply so large that the client would treat it as a
denial of service attack.  Clients currently have no way during
negotiation to request such a limit of the server, so it is easier to
just document this as a restriction on viable server implementations
than to add yet another round of handshaking.  Also, mentioning
amplification effects is worthwhile.

When qemu first implemented NBD_CMD_BLOCK_STATUS for the
base:allocation context (qemu commit e7b1948d51, Mar 2018), it behaved
as if NBD_CMD_FLAG_REQ_ONE were always passed by the client, and never
responded with more than one extent.  Later, when adding its
qemu:dirty-bitmap:XYZ context extension (qemu commit 3d068aff16, Jun
2018), it added a cap to 128k extents (1M+4 bytes), and that cap was
applied to base:allocation once qemu started sending multiple extents
for that context as well (qemu commit fb7afc797e, Jul 2018).  Qemu
extents are never smaller than 512 bytes (other than an exception at
the end of a file whose size is not aligned to 512), but even so, a
request for just under 4G of block status could produce 8M extents,
resulting in a reply of 64M if it were not capped smaller.

When nbdkit first implemented NBD_CMD_BLOCK_STATUS (nbdkit 4ca66f70a5,
Mar 2019), it did not impose any restriction on the number of extents
in the reply chunk.  But because it allows extents as small as one
byte, it is easy to write a server that can amplify a client's request
of status over 1M of the image into a reply over 8M in size, and it
was very easy to demonstrate that a hard cap was needed to avoid
crashing clients or otherwise killing the connection (a bad server
impacting the client negatively).  So nbdkit enforced a bound of 1M
extents (8M+4 bytes, nbdkit commit 6e0dc839ea, Jun 2019).  [Unrelated
to this patch, but worth noting for history: nbdkit's situation also
has to deal with the fact that it is designed for plugin server
implementations; and not capping the number of extents in a reply also
posed a problem to nbdkit as the server, where a plugin could exhaust
memory and kill the server, unrelated to any size constraints enforced
by a client.]

Since the limit chosen by these two implementations is different, and
since nbdkit has versions that were not limited, add this as a SHOULD
NOT instead of MUST NOT constraint on servers implementing block
status.  It does not matter that qemu picked a smaller limit that it
truncates to, since we have already documented that the server may
truncate for other reasons (such as it being inefficient to collect
that many extents in the first place).  But documenting the limit now
becomes even more important in the face of a future addition of 64-bit
requests, where a client's request is no longer bounded to 4G and
could thereby produce even more than 8M extents for the corner case
when every 512 bytes is a new extent, if it were not for this
recommendation.
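
The arithmetic behind those numbers, as a sketch (each block-status
extent descriptor is 8 bytes: a 32-bit length plus 32-bit status
flags):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t request = (UINT64_C(4) << 30) - 512;  /* just under 4G */
    uint64_t extents = request / 512;              /* worst case: ~8M */
    uint64_t reply = 4 + 8 * extents;              /* ~64M if uncapped */
    uint64_t qemu_cap = 4 + 8 * (UINT64_C(128) << 10);  /* 1M + 4 */
    uint64_t nbdkit_cap = 4 + 8 * (UINT64_C(1) << 20);  /* 8M + 4 */

    printf("uncapped %" PRIu64 ", qemu cap %" PRIu64 ", nbdkit cap %" PRIu64 "\n",
           reply, qemu_cap, nbdkit_cap);
    return 0;
}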

---
v2: Add wording about amplification effect
---
 doc/proto.md | 51 +++
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/doc/proto.md b/doc/proto.md
index 3a96703..8f08583 100644
--- a/doc/proto.md
+++ b/doc/proto.md
@@ -1818,6 +1818,12 @@ MUST initiate a hard disconnect.
   the different contexts need not have the same number of extents or
   cumulative extent length.

+  Servers SHOULD NOT send more than 2^20 extents in a single reply
+  chunk; in other words, the size of
+  `NBD_REPLY_TYPE_BLOCK_STATUS` should not be more than 4 + 8*2^20
+  (8,388,612 bytes), even if this requires that the server truncate
+  the response in relation to the *length* requested by the client.
+
   Even if the client did not use the `NBD_CMD_FLAG_REQ_ONE` flag in
   its request, the server MAY return fewer descriptors in the reply
   than would be required to fully specify the whole range of requested
@@ -2180,26 +2186,31 @@ The following request types exist:
 `NBD_REPLY_TYPE_BLOCK_STATUS` chunk represent consecutive portions
 of the file starting from specified *offset*.  If the client used
 the `NBD_CMD_FLAG_REQ_ONE` flag, each chunk contains exactly one
-descriptor where the *length* of the descriptor MUST NOT be greater
-than the *length* of the request; otherwise, a chunk MAY contain
-multiple descriptors, and the final descriptor MAY extend beyond
-the original requested size if the server can determine a larger
-length without additional effort.  On the other hand, the server MAY
-return less data than requested. However the server MUST return at
-least one status descriptor (and since each status descriptor has
-a non-zero length, a client can always make progress on a
-successf

Re: [PATCH] hw/sd: Fix sun4i allwinner-sdhost for U-Boot

2022-11-14 Thread Strahinja Jankovic
Hi,

Thank you for your reply.

On Mon, Nov 14, 2022 at 4:42 PM Peter Maydell  wrote:
>
> On Sat, 12 Nov 2022 at 21:49, Strahinja Jankovic
>  wrote:
> >
> > Trying to run U-Boot for Cubieboard (Allwinner A10) fails because it cannot
> > access SD card. The problem is that FIFO register in current
> > allwinner-sdhost implementation is at the address corresponding to
> > Allwinner H3, but not A10.
> > Linux kernel is not affected since Linux driver uses DMA access and does
> > not use FIFO register for reading/writing.
> >
> > This patch adds new class parameter `is_sun4i` and based on that
> > parameter uses register at offset 0x100 either as FIFO register (if
> > sun4i) or as threshold register (if not sun4i; in this case register at
> > 0x200 is FIFO register).
> >
> > Tested with U-Boot and Linux kernel image built for Cubieboard and
> > OrangePi PC.
> >
> > Signed-off-by: Strahinja Jankovic 
> > ---
> >  hw/sd/allwinner-sdhost.c | 67 ++--
> >  include/hw/sd/allwinner-sdhost.h |  1 +
> >  2 files changed, 47 insertions(+), 21 deletions(-)
> >
> > diff --git a/hw/sd/allwinner-sdhost.c b/hw/sd/allwinner-sdhost.c
> > index 455d6eabf6..51e5e90830 100644
> > --- a/hw/sd/allwinner-sdhost.c
> > +++ b/hw/sd/allwinner-sdhost.c
> > @@ -65,7 +65,7 @@ enum {
> >  REG_SD_DLBA   = 0x84,  /* Descriptor List Base Address */
> >  REG_SD_IDST   = 0x88,  /* Internal DMA Controller Status */
> >  REG_SD_IDIE   = 0x8C,  /* Internal DMA Controller IRQ Enable */
> > -REG_SD_THLDC  = 0x100, /* Card Threshold Control */
> > +REG_SD_THLDC  = 0x100, /* Card Threshold Control / FIFO (sun4i only)*/
> >  REG_SD_DSBD   = 0x10C, /* eMMC DDR Start Bit Detection Control */
> >  REG_SD_RES_CRC= 0x110, /* Response CRC from card/eMMC */
> >  REG_SD_DATA7_CRC  = 0x114, /* CRC Data 7 from card/eMMC */
> > @@ -415,10 +415,29 @@ static void allwinner_sdhost_dma(AwSdHostState *s)
> >  }
> >  }
> >
> > +static uint32_t allwinner_sdhost_fifo_read(AwSdHostState *s)
> > +{
> > +uint32_t res = 0;
> > +
> > +if (sdbus_data_ready(&s->sdbus)) {
> > +sdbus_read_data(&s->sdbus, &res, sizeof(uint32_t));
> > +le32_to_cpus(&res);
> > +allwinner_sdhost_update_transfer_cnt(s, sizeof(uint32_t));
> > +allwinner_sdhost_auto_stop(s);
> > +allwinner_sdhost_update_irq(s);
> > +} else {
> > +qemu_log_mask(LOG_GUEST_ERROR, "%s: no data ready on SD bus\n",
> > +  __func__);
> > +}
> > +
> > +return res;
> > +}
> > +
> >  static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset,
> >unsigned size)
> >  {
> >  AwSdHostState *s = AW_SDHOST(opaque);
> > +AwSdHostClass *sc = AW_SDHOST_GET_CLASS(s);
> >  uint32_t res = 0;
> >
> >  switch (offset) {
> > @@ -508,8 +527,12 @@ static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset,
> >  case REG_SD_IDIE:  /* Internal DMA Controller Interrupt Enable */
> >  res = s->dmac_irq;
> >  break;
> > -case REG_SD_THLDC: /* Card Threshold Control */
> > -res = s->card_threshold;
> > +case REG_SD_THLDC: /* Card Threshold Control or FIFO register (sun4i) */
> > +if (sc->is_sun4i) {
> > +res = allwinner_sdhost_fifo_read(s);
> > +} else {
> > +res = s->card_threshold;
> > +}
> >  break;
> >  case REG_SD_DSBD:  /* eMMC DDR Start Bit Detection Control */
> >  res = s->startbit_detect;
> > @@ -531,16 +554,7 @@ static uint64_t allwinner_sdhost_read(void *opaque, hwaddr offset,
> >  res = s->status_crc;
> >  break;
> >  case REG_SD_FIFO:  /* Read/Write FIFO */
> > -if (sdbus_data_ready(&s->sdbus)) {
> > -sdbus_read_data(&s->sdbus, &res, sizeof(uint32_t));
> > -le32_to_cpus(&res);
> > -allwinner_sdhost_update_transfer_cnt(s, sizeof(uint32_t));
> > -allwinner_sdhost_auto_stop(s);
> > -allwinner_sdhost_update_irq(s);
> > -} else {
> > -qemu_log_mask(LOG_GUEST_ERROR, "%s: no data ready on SD bus\n",
> > -  __func__);
> > -}
> > +res = allwinner_sdhost_fifo_read(s);
>
> Does the sun4i really have the FIFO at both addresses, or should
> this one do something else for sun4i ?

The sun4i sdhost actually has no registers with an offset higher than
0x100 (the offset of REG_SD_THLDC in the patch), so REG_SD_DSBD, all
REG_SD_*_CRC, REG_SD_CRC_STA and REG_SD_FIFO@0x200 should not be
accessed from application code meant to run on sun4i. That is why I
only changed the FIFO/THLDC (offset 0x100) register behavior, since
that change makes U-Boot work.

I could update the patch so that all of the registers with an offset
bigger than 0x100 log an error if sun4i is selected, to make this
clearer. Would that be ok?
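
Something like the following guard, perhaps (a sketch of the idea; the
exact offset check is an assumption, not the final patch):

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define REG_SD_THLDC 0x100      /* highest register present on sun4i */

/* On sun4i, flag any access above 0x100 instead of silently decoding
 * registers that only exist on later SoCs such as the H3.
 */
static bool sun4i_offset_valid(bool is_sun4i, uint64_t offset)
{
    if (is_sun4i && offset > REG_SD_THLDC) {
        fprintf(stderr, "allwinner-sdhost: bad sun4i offset 0x%" PRIx64 "\n",
                offset);
        return false;
    }
    return true;
}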

Best regards,
Strahinja


>

Re: [PATCH v4 3/7] target/arm: Make stage_2_format for cache attributes optional

2022-11-14 Thread Peter Maydell
On Sun, 23 Oct 2022 at 16:37,  wrote:
>
> From: Tobias Röhmel 
>
> The v8R PMSAv8 has a two-stage MPU translation process, but, unlike
> VMSAv8, the stage 2 attributes are in the same format as the stage 1
> attributes (8-bit MAIR format). Rather than converting the MAIR
> format to the format used for VMSA stage 2 (bits [5:2] of a VMSA
> stage 2 descriptor) and then converting back to do the attribute
> combination, allow combined_attrs_nofwb() to accept s2 attributes
> that are already in the MAIR format.
>
> We move the assert() to combined_attrs_fwb(), because that function
> really does require a VMSA stage 2 attribute format. (We will never
> get there for v8R, because PMSAv8 does not implement FEAT_S2FWB.)
>
> Signed-off-by: Tobias Röhmel 
> ---
>  target/arm/ptw.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/target/arm/ptw.c b/target/arm/ptw.c
> index 2ddfc028ab..db50715fa7 100644
> --- a/target/arm/ptw.c
> +++ b/target/arm/ptw.c
> @@ -2105,7 +2105,11 @@ static uint8_t combined_attrs_nofwb(CPUARMState *env,
>  {
>  uint8_t s1lo, s2lo, s1hi, s2hi, s2_mair_attrs, ret_attrs;
>
> -s2_mair_attrs = convert_stage2_attrs(env, s2.attrs);
> +if (s2.is_s2_format) {
> +s2_mair_attrs = convert_stage2_attrs(env, s2.attrs);
> +} else {
> +s2_mair_attrs = s2.attrs;
> +}

You'll find when you next rebase that this needs adjustment,
because after a recent refactoring, this function no longer gets
passed the env but instead is passed the effective HCR_EL2 value.

>
>  s1lo = extract32(s1.attrs, 0, 4);
>  s2lo = extract32(s2_mair_attrs, 0, 4);
> @@ -2163,6 +2167,8 @@ static uint8_t force_cacheattr_nibble_wb(uint8_t attr)
>  static uint8_t combined_attrs_fwb(CPUARMState *env,
>ARMCacheAttrs s1, ARMCacheAttrs s2)
>  {
> +assert(s2.is_s2_format && !s1.is_s2_format);
> +
>  switch (s2.attrs) {
>  case 7:
>  /* Use stage 1 attributes */
> @@ -2212,7 +2218,6 @@ static ARMCacheAttrs combine_cacheattrs(CPUARMState *env,
>  ARMCacheAttrs ret;
>  bool tagged = false;
>
> -assert(s2.is_s2_format && !s1.is_s2_format);

I think we should still assert(!s1.is_s2_format) here.

>  ret.is_s2_format = false;
>
>  if (s1.attrs == 0xf0) {
> --
> 2.34.1

Otherwise
Reviewed-by: Peter Maydell 

thanks
-- PMM



Re: [PATCH v1 5/9] hw/virtio: introduce virtio_device_should_start

2022-11-14 Thread Christian Borntraeger

Am 08.11.22 um 10:23 schrieb Alex Bennée:

The previous fix to virtio_device_started revealed a problem in its
use by both the core and the device code. The core code should be able
to handle the device "starting" while the VM isn't running to handle
the restoration of migration state. To solve this dual use introduce a
new helper for use by the vhost-user backends who all use it to feed a
should_start variable.

We can also pick up a change to vhost_user_blk_set_status while we are
at it, which follows the same pattern.

Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to virtio_device_started)
Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
Signed-off-by: Alex Bennée 
Cc: "Michael S. Tsirkin" 


Hmmm, is this
commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
Author: Alex Bennée 
AuthorDate: Mon Nov 7 12:14:07 2022 +
Commit: Michael S. Tsirkin 
CommitDate: Mon Nov 7 14:08:18 2022 -0500

hw/virtio: introduce virtio_device_should_start

an older version?

This does not seem to fix the regression that I have reported.



Re: [PATCH v9 5/5] docs: Add generic vhost-vdpa device documentation

2022-11-14 Thread Jason Wang
On Sat, Nov 12, 2022 at 10:40 PM Longpeng(Mike)  wrote:
>
> From: Longpeng 
>
> Signed-off-by: Longpeng 
> ---
>  .../devices/vhost-vdpa-generic-device.rst | 46 +++
>  1 file changed, 46 insertions(+)
>  create mode 100644 docs/system/devices/vhost-vdpa-generic-device.rst
>
> diff --git a/docs/system/devices/vhost-vdpa-generic-device.rst 
> b/docs/system/devices/vhost-vdpa-generic-device.rst
> new file mode 100644
> index 00..d6db9af755
> --- /dev/null
> +++ b/docs/system/devices/vhost-vdpa-generic-device.rst
> @@ -0,0 +1,46 @@
> +
> +=
> +vhost-vDPA generic device
> +=
> +
> +This document explains the usage of the vhost-vDPA generic device.
> +
> +Description
> +---
> +
> +A vDPA (virtio data path acceleration) device is a device that uses a
> +datapath which complies with the virtio specifications, together with a
> +vendor-specific control path.
> +
> +QEMU provides two types of vhost-vDPA device to enable a vDPA device: one
> +is type sensitive, which means QEMU needs to know the actual device type
> +(e.g. net, blk, scsi), and the other is called the "vhost-vDPA generic
> +device", which is type insensitive.
> +
> +The vhost-vDPA generic device builds on the vhost-vdpa subsystem and virtio
> +subsystem. It is quite small, but it can support any type of virtio device.
> +
> +Examples
> +
> +
> +Prepare the vhost-vDPA backends first:

Nit: here we'd better first say that it needs some vendor-specific steps
to provision the vDPA device and bind it to the vhost-vDPA driver. Then
we can see it under the /dev directory.

Thanks

> +
> +::
> +  host# ls -l /dev/vhost-vdpa-*
> +  crw------- 1 root root 236, 0 Nov  2 00:49 /dev/vhost-vdpa-0
> +
> +Start QEMU with virtio-mmio bus:
> +
> +::
> +  host# qemu-system  \
> +  -M microvm -m 512 -smp 2 -kernel ... -initrd ...   \
> +  -device vhost-vdpa-device,vhostdev=/dev/vhost-vdpa-0   \
> +  ...
> +
> +Start QEMU with virtio-pci bus:
> +
> +::
> +  host# qemu-system  \
> +  -M pc -m 512 -smp 2\
> +  -device vhost-vdpa-device-pci,vhostdev=/dev/vhost-vdpa-0   \
> +  ...
> --
> 2.23.0
>




[PATCH v2 0/1] tcg: add perfmap and jitdump

2022-11-14 Thread Ilya Leoshkevich
v1:
https://lists.nongnu.org/archive/html/qemu-devel/2022-10/msg01824.html
https://lists.nongnu.org/archive/html/qemu-devel/2022-11/msg01073.html

v1 -> v2:
* Use QEMU_LOCK_GUARD (Alex).
* Handle TARGET_TB_PCREL (Alex).
* Support ELF -kernels, add a note about this (Alex). Tested with
  qemu-system-x86_64 and Linux kernel - it's not fast, but it works.
* Minor const correctness and style improvements.

Ilya Leoshkevich (1):
  tcg: add perfmap and jitdump

 accel/tcg/debuginfo.c |  99 +++
 accel/tcg/debuginfo.h |  52 ++
 accel/tcg/meson.build |   2 +
 accel/tcg/perf.c  | 334 ++
 accel/tcg/perf.h  |  28 
 accel/tcg/translate-all.c |   8 +
 docs/devel/tcg.rst|  23 +++
 hw/core/loader.c  |   5 +
 include/tcg/tcg.h |   4 +-
 linux-user/elfload.c  |   3 +
 linux-user/exit.c |   2 +
 linux-user/main.c |  15 ++
 linux-user/meson.build|   1 +
 meson.build   |   8 +
 qemu-options.hx   |  20 +++
 softmmu/vl.c  |  11 ++
 tcg/region.c  |   2 +-
 tcg/tcg.c |   2 +
 18 files changed, 616 insertions(+), 3 deletions(-)
 create mode 100644 accel/tcg/debuginfo.c
 create mode 100644 accel/tcg/debuginfo.h
 create mode 100644 accel/tcg/perf.c
 create mode 100644 accel/tcg/perf.h

-- 
2.37.2




[libnbd PATCH v2 18/23] generator: Actually request extended headers

2022-11-14 Thread Eric Blake
This is the culmination of the previous patches' preparation work for
using extended headers when possible.  The new states in the state
machine are copied extensively from our handling of
OPT_STRUCTURED_REPLY.  The next patch will then expose a new API
nbd_opt_extended_headers() for manual control.

At the same time I posted this patch, I had patches for qemu-nbd to
support extended headers as server (nbdkit is a bit tougher).  The
next patches will add some interop tests that pass when using a new
enough qemu-nbd, showing that we have cross-project interoperability
and therefore an extension worth standardizing.
---
 generator/API.ml  | 87 -
 generator/Makefile.am |  3 +-
 generator/state_machine.ml| 41 
 .../states-newstyle-opt-extended-headers.c| 94 +++
 generator/states-newstyle-opt-starttls.c  |  6 +-
 5 files changed, 185 insertions(+), 46 deletions(-)
 create mode 100644 generator/states-newstyle-opt-extended-headers.c

diff --git a/generator/API.ml b/generator/API.ml
index 3d0289f6..570f15f4 100644
--- a/generator/API.ml
+++ b/generator/API.ml
@@ -953,23 +953,24 @@   "set_request_meta_context", {
 (all C calls when L is false,
 or L and L when option mode is
 enabled) will also try to issue NBD_OPT_SET_META_CONTEXT when
-the server supports structured replies and any contexts were
-registered by L.  The default setting
-is true; however the extra step of negotiating meta contexts is
-not always desirable: performing both info and go on the same
-export works without needing to re-negotiate contexts on the
-second call; integration testing of other servers may benefit
-from manual invocation of L at
-other times in the negotiation sequence; and even when using just
-L, it can be faster to collect the server's
+the server supports structured replies or extended headers and
+any contexts were registered by L.  The
+default setting is true; however the extra step of negotiating
+meta contexts is not always desirable: performing both info and
+go on the same export works without needing to re-negotiate
+contexts on the second call; integration testing of other servers
+may benefit from manual invocation of L
+at other times in the negotiation sequence; and even when using
+just L, it can be faster to collect the server's
 results by relying on the callback function passed to
 L than a series of post-process
 calls to L.

 Note that this control has no effect if the server does not
-negotiate structured replies, or if the client did not request
-any contexts via L.  Setting this
-control to false may cause L to fail.";
+negotiate structured replies or extended headers, or if the
+client did not request any contexts via L.
+Setting this control to false may cause L
+to fail.";
 see_also = [Link "set_opt_mode"; Link "opt_go"; Link "opt_info";
 Link "opt_list_meta_context"; Link "opt_set_meta_context";
 Link "get_structured_replies_negotiated";
@@ -1404,11 +1405,11 @@   "opt_info", {
 If successful, functions like L and
 L will report details about that export.  If
 L is set (the default) and
-structured replies were negotiated, it is also valid to use
-L after this call.  However, it may be
-more efficient to clear that setting and manually utilize
-L with its callback approach, for
-learning which contexts an export supports.  In general, if
+structured replies or extended headers were negotiated, it is also
+valid to use L after this call.  However,
+it may be more efficient to clear that setting and manually
+utilize L with its callback approach,
+for learning which contexts an export supports.  In general, if
 L is called next, that call will likely succeed
 with the details remaining the same, although this is not
 guaranteed by all servers.
@@ -1538,12 +1539,12 @@   "opt_set_meta_context", {
 recent L or L.
 This can only be used if L enabled option
 mode.  Normally, this function is redundant, as L
-automatically does the same task if structured replies have
-already been negotiated.  But manual control over meta context
-requests can be useful for fine-grained testing of how a server
-handles unusual negotiation sequences.  Often, use of this
-function is coupled with L to
-bypass the automatic context request normally performed by
+automatically does the same task if structured replies or extended
+headers have already been negotiated.  But manual control over
+meta context requests can be useful for fine-grained testing of
+how a server handles unusual negotiation sequences.  Often, use
+of this function is coupled with L
+to bypass the automatic context request normally performed by
 L.

 The NBD protocol allows a client to decide how many queries to ask
@@ -1597,12 +1598,13 @@   "opt_set_meta_context_queries", {
 or L.  This can only be used if
 L enabled option mode.  Normally, this
 function is redundant, as L automatically does
-the same

Re: [PATCH v9 0/8] KVM: mm: fd-based approach for supporting KVM

2022-11-14 Thread Alex Bennée


Chao Peng  writes:


> Introduction
> 
> KVM userspace being able to crash the host is horrible. Under the current
> KVM architecture, all guest memory is inherently accessible from KVM
> userspace and is exposed to the mentioned crash issue. The goal of this
> series is to provide a solution for mm and KVM to align on an approach of
> exposing guest memory that is inaccessible to userspace.
>
> Normally, KVM populates secondary page table (e.g. EPT) by using a host
> virtual address (hva) from core mm page table (e.g. x86 userspace page
> table). This requires guest memory being mmaped into KVM userspace, but
> this is also the source where the mentioned crash issue can happen. In
> theory, apart from those 'shared' memory for device emulation etc, guest
> memory doesn't have to be mmaped into KVM userspace.
>
> This series introduces fd-based guest memory which will not be mmaped
> into KVM userspace. KVM populates secondary page table by using a
> fd/offset pair backed by a memory file system. The fd can be created
> from a supported memory filesystem like tmpfs/hugetlbfs and KVM can
> directly interact with them with newly introduced in-kernel interface,
> therefore remove the KVM userspace from the path of accessing/mmaping
> the guest memory. 
>
> Kirill had a patch [2] to address the same issue in a different way. It
> tracks guest encrypted memory at the 'struct page' level and relies on
> HWPOISON to reject userspace access. The patch has been discussed in
> several online and offline threads and resulted in a design document [3],
> which is also the original proposal for this series. This patch series
> later evolved as more comments were received from the community, but the
> major concepts in [3] still hold true, so it is recommended reading.
>
> The patch series may also be useful for other usages; for example, a pure
> software approach may use it to harden itself against unintentional
> access to guest memory. This series is designed with these usages in
> mind, but doesn't have code to directly support them, and extensions
> might be needed.

There are a couple of additional use cases where having a consistent
memory interface with the kernel would be useful.

  - Xen DomU guests providing other domains with VirtIO backends

  Xen by default doesn't give other domains special access to a domains
  memory. The guest can grant access to regions of its memory to other
  domains for this purpose. 

  - pKVM on ARM

  Similar to Xen, pKVM moves the management of the page tables into the
  hypervisor and again doesn't allow those domains to share memory by
  default.

  - VirtIO loopback

  This allows for VirtIO devices for the host kernel to be serviced by
  backends running in userspace. Obviously the memory userspace is
  allowed to access is strictly limited to the buffers and queues
  because giving userspace unrestricted access to the host kernel would
  have consequences.

All of these VirtIO backends work with vhost-user which uses memfds to
pass references to guest memory from the VMM to the backend
implementation.
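
As a reminder of that mechanism, here is a minimal memfd sketch
(illustrative only, not from this series): the VMM backs guest RAM with
a memfd and hands the fd to the backend over the vhost-user socket via
SCM_RIGHTS.

  #define _GNU_SOURCE
  #include <sys/mman.h>
  #include <unistd.h>

  /* Illustrative: create guest RAM that a vhost-user backend can map. */
  static void *
  create_shareable_ram (size_t ram_size, int *fd_out)
  {
    int fd = memfd_create ("guest-ram", MFD_CLOEXEC | MFD_ALLOW_SEALING);

    ftruncate (fd, ram_size);
    *fd_out = fd;   /* sent to the backend, which mmaps the same pages */
    return mmap (NULL, ram_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  }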

> mm change
> =
> Introduces a new memfd_restricted system call which can create memory
> file that is restricted from userspace access via normal MMU operations
> like read(), write() or mmap() etc and the only way to use it is
> passing it to a third kernel module like KVM and relying on it to
> access the fd through the newly added restrictedmem kernel interface.
> The restrictedmem interface bridges the memory file subsystems
> (tmpfs/hugetlbfs etc) and their users (KVM in this case) and provides
> bi-directional communication between them. 
>
>
> KVM change
> ==
> Extends the KVM memslot to provide guest private (encrypted) memory from
> a fd. With this extension, a single memslot can maintain both private
> memory through private fd (restricted_fd/restricted_offset) and shared
> (unencrypted) memory through userspace mmaped host virtual address
> (userspace_addr). For a particular guest page, the corresponding page in
> KVM memslot can be only either private or shared and only one of the
> shared/private parts of the memslot is visible to guest. For how this
> new extension is used in QEMU, please refer to kvm_set_phys_mem() in
> below TDX-enabled QEMU repo.
>
> Introduces new KVM_EXIT_MEMORY_FAULT exit to allow userspace to get the
> chance on decision-making for shared <-> private memory conversion. The
> exit can be an implicit conversion in KVM page fault handler or an
> explicit conversion from guest OS.
>
> Extends existing SEV ioctls KVM_MEMORY_ENCRYPT_{UN,}REG_REGION to
> convert a guest page between private <-> shared. The data maintained in
> these ioctls tells the truth whether a guest page is private or shared
> and this information will be used in KVM page fault handler to decide
> whether the private or the shared part of the memslot is visible to
> guest.
>


-- 
Alex Bennée



[libnbd PATCH v2 07/23] generator: Add struct nbd_extent in prep for 64-bit extents

2022-11-14 Thread Eric Blake
The existing nbd_block_status() callback is permanently stuck with an
array of uint32_t pairs (len/2 extents), which is both constrained on
maximum extent size (no larger than 4G) and on the status flags (must
fit in 32 bits).  While the "base:allocation" metacontext will never
exceed 32 bits, it is not hard to envision other extension
metacontexts where a 64-bit status would be useful (for example, Zoned
Block Devices expressing a 64-bit offset[1]).  Exposing 64-bit extents
will require a new API; we now have the decision of whether to copy
the existing API practice of returning a bare array containing len/2
extent pairs, or with a saner idea of an array of structs with len
extents.  Returning an array of structs is actually easier to map to
various language bindings, particularly since we know the length field
can fit in 63-bits (fallout from the fact that NBD exports are capped
in size by signed off_t), but where the status field must represent a
full 64 bits (assuming that the user wants to connect to a metadata
extension that utilizes 64 bits, rather than existing metadata
contexts that only expose 32 bits).

This patch introduces the struct we plan to use in the new API, along
with language bindings.  The bindings for Python and OCaml were
relatively straightforward; the Golang bindings took a bit more effort
for me to write.  Temporary unused attributes are needed to keep the
compiler happy until a later patch exposes a new API using the new
callback type.

Note that 'struct nbd_block_descriptor_ext' in lib/nbd-protocol.h is
exactly the same as what we want to use in C.  But it is easier to
stick a new public type in  than to figure out how to expose
just part of a header we only want to use internally.

[1] https://zonedstorage.io/docs/linux/zbd-api
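
For orientation, the public type being added boils down to the
following (a sketch based on the description above; the installed
header is generated, so comments and layout may differ):

  /* One extent: length is guaranteed to fit in 63 bits (exports are
   * capped in size by signed off_t); flags gets the full 64 bits. */
  typedef struct {
    uint64_t length;
    uint64_t flags;
  } nbd_extent;
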
---
 generator/API.ml| 12 +++-
 generator/API.mli   |  1 +
 generator/C.ml  | 24 +---
 generator/GoLang.ml | 24 
 generator/OCaml.ml  | 19 ---
 generator/Python.ml | 21 -
 ocaml/helpers.c | 20 
 ocaml/nbd-c.h   |  3 ++-
 golang/handle.go|  8 +++-
 9 files changed, 122 insertions(+), 10 deletions(-)

diff --git a/generator/API.ml b/generator/API.ml
index beb7a2b4..85509867 100644
--- a/generator/API.ml
+++ b/generator/API.ml
@@ -42,6 +42,7 @@
 | BytesPersistOut of string * string
 | Closure of closure
 | Enum of string * enum
+| Extent64 of string
 | Fd of string
 | Flags of string * flags
 | Int of string
@@ -157,6 +158,14 @@ let extent_closure =
 "nr_entries");
  CBMutable (Int "error") ]
 }
+let extent64_closure = {
+  cbname = "extent64";
+  cbargs = [ CBString "metacontext";
+ CBUInt64 "offset";
+ CBArrayAndLen (Extent64 "entries",
+"nr_entries");
+ CBMutable (Int "error") ]
+}
 let list_closure = {
   cbname = "list";
   cbargs = [ CBString "name"; CBString "description" ]
@@ -166,7 +175,8 @@ let context_closure =
   cbargs = [ CBString "name" ]
 }
 let all_closures = [ chunk_closure; completion_closure;
- debug_closure; extent_closure; list_closure;
+ debug_closure; extent_closure; extent64_closure;
+ list_closure;
  context_closure ]

 (* Enums. *)
diff --git a/generator/API.mli b/generator/API.mli
index b0267705..12ad1fb4 100644
--- a/generator/API.mli
+++ b/generator/API.mli
@@ -52,6 +52,7 @@ and
 | BytesPersistOut of string * string
 | Closure of closure   (** function pointer + void *opaque *)
 | Enum of string * enum(** enum/union type, int in C *)
+| Extent64 of string   (** extent descriptor, struct nbd_extent in C *)
 | Fd of string (** file descriptor *)
 | Flags of string * flags  (** flags, uint32_t in C *)
 | Int of string(** small int *)
diff --git a/generator/C.ml b/generator/C.ml
index f9171996..51a4df65 100644
--- a/generator/C.ml
+++ b/generator/C.ml
@@ -93,6 +93,7 @@ let
   | Closure { cbname } ->
  [ sprintf "%s_callback" cbname; sprintf "%s_user_data" cbname ]
   | Enum (n, _) -> [n]
+  | Extent64 n -> [n]
   | Fd n -> [n]
   | Flags (n, _) -> [n]
   | Int n -> [n]
@@ -128,7 +129,7 @@ let
   (* list of strings should be marked as non-null *)
   | StringList n -> [ true ]
   (* numeric and other non-pointer types are not able to be null *)
-  | Bool _ | Closure _ | Enum _ | Fd _ | Flags _
+  | Bool _ | Closure _ | Enum _ | Extent64 _ | Fd _ | Flags _
   | Int _ | Int64 _ | SizeT _
   | UInt _ | UInt32 _ | UInt64 _ | UIntPtr _ -> [ false ]

@@ -182,6 +183,9 @@ and
   | Enum (n, _) ->
  if types then pr "int ";
  pr "%s" n
+  | Extent64 n ->
+ if types then pr "nbd_extent ";
+ pr "%s" n
   | Flags (n, _) ->
  if types then pr "uint32_t ";
  pr "%s" n
@@ -292,6 +296,11 @@ and
  pr "%s, " n;
  if types then 

Re: [PATCH v2 2/3] hw/riscv: pfsoc: add missing FICs as unimplemented

2022-11-14 Thread Alistair Francis
On Sat, Nov 12, 2022 at 11:37 PM Conor Dooley  wrote:
>
> From: Conor Dooley 
>
> The Fabric Interconnect Controllers provide interfaces between the FPGA
> fabric and the core complex. There are 5 FICs on PolarFire SoC, numbered
> 0 through 4. FIC2 is an AXI4 slave interface from the FPGA fabric and
> does not show up on the MSS memory map. FIC4 is dedicated to the User
> Crypto Processor and does not show up on the MSS memory map either.
>
> FIC 0, 1 & 3 do show up in the MSS memory map and neither FICs 0 or 1
> are represented in QEMU, leading to load access violations while booting
> Linux for Icicle if PCIe is enabled as the root port is connected via
> either FIC 0 or 1.
>
> Signed-off-by: Conor Dooley 

Acked-by: Alistair Francis 

Alistair

> ---
>  hw/riscv/microchip_pfsoc.c | 115 -
>  include/hw/riscv/microchip_pfsoc.h |   2 +
>  2 files changed, 65 insertions(+), 52 deletions(-)
>
> diff --git a/hw/riscv/microchip_pfsoc.c b/hw/riscv/microchip_pfsoc.c
> index a821263d4f..2a24e3437a 100644
> --- a/hw/riscv/microchip_pfsoc.c
> +++ b/hw/riscv/microchip_pfsoc.c
> @@ -86,58 +86,61 @@
>   * describes the complete IOSCB modules memory maps
>   */
>  static const MemMapEntry microchip_pfsoc_memmap[] = {
> -[MICROCHIP_PFSOC_RSVD0] =           {        0x0,      0x100 },
> -[MICROCHIP_PFSOC_DEBUG] =           {      0x100,      0xf00 },
> -[MICROCHIP_PFSOC_E51_DTIM] =        {  0x1000000,     0x2000 },
> -[MICROCHIP_PFSOC_BUSERR_UNIT0] =    {  0x1700000,     0x1000 },
> -[MICROCHIP_PFSOC_BUSERR_UNIT1] =    {  0x1701000,     0x1000 },
> -[MICROCHIP_PFSOC_BUSERR_UNIT2] =    {  0x1702000,     0x1000 },
> -[MICROCHIP_PFSOC_BUSERR_UNIT3] =    {  0x1703000,     0x1000 },
> -[MICROCHIP_PFSOC_BUSERR_UNIT4] =    {  0x1704000,     0x1000 },
> -[MICROCHIP_PFSOC_CLINT] =           {  0x2000000,    0x10000 },
> -[MICROCHIP_PFSOC_L2CC] =            {  0x2010000,     0x1000 },
> -[MICROCHIP_PFSOC_DMA] =             {  0x3000000,   0x100000 },
> -[MICROCHIP_PFSOC_L2LIM] =           {  0x8000000,  0x2000000 },
> -[MICROCHIP_PFSOC_PLIC] =            {  0xc000000,  0x4000000 },
> -[MICROCHIP_PFSOC_MMUART0] =         { 0x20000000,     0x1000 },
> -[MICROCHIP_PFSOC_WDOG0] =           { 0x20001000,     0x1000 },
> -[MICROCHIP_PFSOC_SYSREG] =          { 0x20002000,     0x2000 },
> -[MICROCHIP_PFSOC_AXISW] =           { 0x20004000,     0x1000 },
> -[MICROCHIP_PFSOC_MPUCFG] =          { 0x20005000,     0x1000 },
> -[MICROCHIP_PFSOC_FMETER] =          { 0x20006000,     0x1000 },
> -[MICROCHIP_PFSOC_DDR_SGMII_PHY] =   { 0x20007000,     0x1000 },
> -[MICROCHIP_PFSOC_EMMC_SD] =         { 0x20008000,     0x1000 },
> -[MICROCHIP_PFSOC_DDR_CFG] =         { 0x20080000,    0x40000 },
> -[MICROCHIP_PFSOC_MMUART1] =         { 0x20100000,     0x1000 },
> -[MICROCHIP_PFSOC_MMUART2] =         { 0x20102000,     0x1000 },
> -[MICROCHIP_PFSOC_MMUART3] =         { 0x20104000,     0x1000 },
> -[MICROCHIP_PFSOC_MMUART4] =         { 0x20106000,     0x1000 },
> -[MICROCHIP_PFSOC_WDOG1] =           { 0x20101000,     0x1000 },
> -[MICROCHIP_PFSOC_WDOG2] =           { 0x20103000,     0x1000 },
> -[MICROCHIP_PFSOC_WDOG3] =           { 0x20105000,     0x1000 },
> -[MICROCHIP_PFSOC_WDOG4] =           { 0x20106000,     0x1000 },
> -[MICROCHIP_PFSOC_SPI0] =            { 0x20108000,     0x1000 },
> -[MICROCHIP_PFSOC_SPI1] =            { 0x20109000,     0x1000 },
> -[MICROCHIP_PFSOC_I2C0] =            { 0x2010a000,     0x1000 },
> -[MICROCHIP_PFSOC_I2C1] =            { 0x2010b000,     0x1000 },
> -[MICROCHIP_PFSOC_CAN0] =            { 0x2010c000,     0x1000 },
> -[MICROCHIP_PFSOC_CAN1] =            { 0x2010d000,     0x1000 },
> -[MICROCHIP_PFSOC_GEM0] =            { 0x20110000,     0x2000 },
> -[MICROCHIP_PFSOC_GEM1] =            { 0x20112000,     0x2000 },
> -[MICROCHIP_PFSOC_GPIO0] =           { 0x20120000,     0x1000 },
> -[MICROCHIP_PFSOC_GPIO1] =           { 0x20121000,     0x1000 },
> -[MICROCHIP_PFSOC_GPIO2] =           { 0x20122000,     0x1000 },
> -[MICROCHIP_PFSOC_RTC] =             { 0x20124000,     0x1000 },
> -[MICROCHIP_PFSOC_ENVM_CFG] =        { 0x20200000,     0x1000 },
> -[MICROCHIP_PFSOC_ENVM_DATA] =       { 0x20220000,    0x20000 },
> -[MICROCHIP_PFSOC_USB] =             { 0x20201000,     0x1000 },
> -[MICROCHIP_PFSOC_QSPI_XIP] =        { 0x21000000,  0x1000000 },
> -[MICROCHIP_PFSOC_IOSCB] =           { 0x30000000, 0x10000000 },
> -[MICROCHIP_PFSOC_FABRIC_FIC3] =     { 0x40000000, 0x20000000 },
> -[MICROCHIP_PFSOC_DRAM_LO] =         { 0x80000000, 0x40000000 },
> -[MICROCHIP_PFSOC_DRAM_LO_ALIAS] =   { 0xc0000000, 0x40000000 },
> -[MICROCHIP_PFSOC_DRAM_HI] =         { 0x1000000000,      0x0 },
> -[MICROCHIP_PFSOC_DRAM_HI_ALIAS] =   { 0x1400000000,      0x0 },
> +[MICROCHIP_PFSOC_RSVD0] =   

Re: [PATCH v2] capstone: use <capstone/capstone.h> instead of <capstone.h>

2022-11-14 Thread Michael Tokarev

14.11.2022 11:58, Daniel P. Berrangé wrote:
..

On current systems, using <capstone/capstone.h> works
now (despite the pkg-config-supplied -I/usr/include/capstone) -
since on all systems capstone headers are put into capstone/
subdirectory of a system include dir. So this change is
compatible with both the obsolete way of including it
and the only future way.


AFAIR, macOS HomeBrew does not put anything into the system
include dir, and always requires -I flags to be correct.


Does it work with the capstone-supplied --cflags and the proposed
include path?  What does pkg-config --cflags capstone return there?

..

-  if capstone.found() and not cc.compiles('#include <capstone.h>',
+  if capstone.found() and not cc.compiles('#include <capstone/capstone.h>',
dependencies: [capstone])


To retain back compat this could probe for both ways

 if capstone.found()
     if cc.compiles('#include <capstone/capstone.h>',
   dependencies: [capstone])
...
     else if cc.compiles('#include <capstone.h>',
dependencies: [capstone])
...
 
then, the source file can try the correct #include based on what
we detect works here.
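
In source form, that fallback would be something like this (the config
macro name is hypothetical, standing in for whatever the meson probe
defines):

  #ifdef HAVE_CAPSTONE_CAPSTONE_H   /* hypothetical probe result */
  #include <capstone/capstone.h>
  #else
  #include <capstone.h>
  #endif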


I don't think this deserves the complexity really, unless there *is*
a system out there which actually needs this.

I mean, these little compat tweaks - it becomes twisty with time,
and no one knows which code paths and config variables are needed
for what, and the whole thing slowly becomes unmanageable... If it's
easy to make it unconditional, it should be done. IMHO anyway :)

Thanks!

/mjt



Re: [PATCH] avocado: use sha1 for fc31 imgs to avoid first time re-download

2022-11-14 Thread Jan Richter




On 11/10/22 20:29, Daniel Henrique Barboza wrote:



On 11/10/22 11:57, Jan Richter wrote:



On 11/10/22 00:26, Philippe Mathieu-Daudé wrote:

On 9/11/22 16:39, Daniel Henrique Barboza wrote:

On 10/27/22 06:01, Daniel P. Berrangé wrote:

On Thu, Oct 27, 2022 at 09:46:29AM +0200, Thomas Huth wrote:

On 24/10/2022 11.02, Daniel P. Berrangé wrote:
On Sat, Oct 22, 2022 at 02:03:50PM -0300, Daniel Henrique Barboza wrote:
'make check-avocado' will download any images that aren't present in the
cache via 'get-vm-images' in tests/Makefile.include. The target that
downloads fedora 31 images, get-vm-image-fedora-31, will use 'avocado
vmimage get --distro=fedora --distro-version=31 --arch=(...)' to
download the image for each arch. Note that this command does not
support any argument to set the hash algorithm used and, based on the
avocado source code [1], DEFAULT_HASH_ALGORITHM is set to "sha1". The
sha1 hash is stored in a Fedora-Cloud-Base-31-1.9.{ARCH}.qcow2-CHECKSUM
in the cache.


For now, in QEMU, let's use sha1 for all Fedora 31 images. This will
immediately spare us at least one extra download for each Fedora 31
image that we're doing in all our CI runs.

[1] https://github.com/avocado-framework/avocado.git @ 942a5d6972906

[2] https://github.com/avocado-framework/avocado/issues/5496


Can we just ask Avocado maintainers to fix this problem on their
side to allow use of a modern hash alg as a priority item. We've
already had this problem in QEMU for over a year AFAICT, so doesn't
seem like we need to urgently do a workaround on QEMU side, so we
can get Avocado devs to commit to fixing it in the next month.


Do we have such a commitment? ... The avocado version in QEMU is
completely backlevel these days; it's still using version 88.1 from
May 2021, i.e. there hasn't been any update in more than a year. I
recently tried to bump it to a newer version on my own (since I'm
still suffering from the problem that find_free_port() does not work
if you don't have a local IPv6 address), but it's not that
straightforward since the recent versions of avocado changed a lot of
things (e.g. the new nrunner - do we want to run tests in parallel?
If so it breaks a lot of the timeout settings, I think), so an update
needs a lot of careful testing...


Hi Daniel,

if the problem with migrating avocado to the latest version in qemu is
only the parallel run, I would suggest disabling it with
`nrunner.max_parallel_tasks` [1]. Even though the differences between
the avocado legacy runner and the nrunner are huge, the migration
should be straightforward. So if you have more issues with the
migration to the nrunner, I would be happy to help you with that.


[1] 
https://avocado-framework.readthedocs.io/en/latest/config/index.html#nrunner-max-parallel-tasks



Thanks Jan and Phil for the info. I didn't manage to do a successful
Avocado run with the QEMU test, though.

What I did, aside from the changes that Phil mentioned in
tests/requirements.txt:

- created a /etc/avocado/avocado.conf to store the settings
- copied python/avocado.cfg from QEMU to avocado.conf
- added the following in avocado.conf:

[nrunner]
max_parallel_tasks  = 1


This allowed me to set up Avocado as it would be when running with the
QEMU avocado setup, but even then I had no success. The test gets stuck
indefinitely at this point:

(...)
2022-11-10 16:00:20,758 avocado.test INFO | Temporary dir: 
/var/tmp/avocado_tmp_znhvpbh0/avocado_job_ywyf7v30

2022-11-10 16:00:20,758 avocado.test INFO |
2022-11-10 16:00:20,758 avocado.test INFO | Job ID: 
4bb3e2a12c05d84a0a06849ecef435d547a198a0

2022-11-10 16:00:20,758 avocado.test INFO |
2022-11-10 16:00:21,041 avocado.core.task.statemachine DEBUG| spawner="0x7fdad5da5840>" max_triaging=12 max_running=1 task_timeout=None> has 
been initialized
2022-11-10 16:00:21,041 avocado.core.task.statemachine DEBUG| Task 
"1-tests/avocado/boot_linux.py:BootLinuxPPC64.test_pseries_tcg": 
requested -> triaging
2022-11-10 16:00:21,042 avocado.core.task.statemachine DEBUG| Task 
"1-tests/avocado/boot_linux.py:BootLinuxPPC64.test_pseries_tcg": 
requirements OK (will proceed to check dependencies)
2022-11-10 16:00:21,042 avocado.core.task.statemachine DEBUG| Task 
"1-tests/avocado/boot_linux.py:BootLinuxPPC64.test_pseries_tcg": about 
to be spawned with "object at 0x7fdad5da5840>"
2022-11-10 16:00:21,043 avocado.core.task.statemachine DEBUG| Task 
"1-tests/avocado/boot_linux.py:BootLinuxPPC64.test_pseries_tcg": spawned 
successfully


No error is provided. Tried to run the test for 10+ minutes and nothing
happens. Hitting CTRL+C aborts the test:

$ make check-avocado 
AVOCADO_TESTS='tests/avocado/boot_linux.py:BootLinuxPPC64.test_pseries_tcg'
   GIT ui/keycodemapdb tests/fp/berkeley-testfloat-3 
tests/fp/berkeley-softfloat-3 dtc

   AVOCADO Downloading avocado tests VM image for ppc64le
The image was downloaded:
Provider Version Architecture File
fedora   31  ppc64le  
/home/danielhb/avocado/data/cache/by_location/d73d707673

Re: [PATCH] docs/system/s390x: Document the "loadparm" machine property

2022-11-14 Thread Claudio Imbrenda
On Mon, 14 Nov 2022 14:25:02 +0100
Thomas Huth  wrote:

> The "loadparm" machine property is useful for selecting alternative
> kernels on the disk of the guest, but so far we do not tell the users
> yet how to use it. Add some documentation to fill this gap.
> 
> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2128235
> Signed-off-by: Thomas Huth 

Reviewed-by: Claudio Imbrenda 

> ---
>  docs/system/s390x/bootdevices.rst | 26 ++
>  1 file changed, 26 insertions(+)
> 
> diff --git a/docs/system/s390x/bootdevices.rst 
> b/docs/system/s390x/bootdevices.rst
> index b5950133e8..40089c35a9 100644
> --- a/docs/system/s390x/bootdevices.rst
> +++ b/docs/system/s390x/bootdevices.rst
> @@ -53,6 +53,32 @@ recommended to specify a CD-ROM device via ``-device 
> scsi-cd`` (as mentioned
>  above) instead.
>  
>  
> +Selecting kernels with the ``loadparm`` property
> +
> +
> +The ``s390-ccw-virtio`` machine supports the so-called ``loadparm`` parameter
> +which can be used to select the kernel on the disk of the guest that the
> +s390-ccw bios should boot. When starting QEMU, it can be specified like 
> this::
> +
> + qemu-system-s390x -machine s390-ccw-virtio,loadparm=
> +
> +The first way to use this parameter is to use the word ``PROMPT`` as the
> + here. In that case the s390-ccw bios will show a list of
> +installed kernels on the disk of the guest and ask the user to enter a number
> +to choose which kernel should be booted -- similar to what can be achieved by
> +specifying the ``-boot menu=on`` option when starting QEMU. Note that the menu
> +list will only show the names of the installed kernels when using a DASD-like
> +disk image with 4k byte sectors; on normal SCSI-style disks with 512-byte
> +sectors, there is not enough space for the zipl loader on the disk to store
> +the kernel names, so you only get a list without names here.
> +
> +The second way to use this parameter is to use a number in the range from 0
> +to 31. The numbers that can be used here correspond to the numbers that are
> +shown when using the ``PROMPT`` option, and the s390-ccw bios will then try
> +to automatically boot the kernel that is associated with the given number.
> +Note that ``0`` can be used to boot the default entry.
> +
> +
>  Booting from a network device
>  -
>  




[libnbd PATCH v2 11/23] api: Add several functions for controlling extended headers

2022-11-14 Thread Eric Blake
The new NBD_OPT_EXTENDED_HEADERS feature is worth using by default,
but there may be cases where the user explicitly wants to stick with
the older 32-bit headers.  nbd_set_request_extended_headers() will let
the client override the default, nbd_get_request_extended_headers()
determines the current state of the request, and
nbd_get_extended_headers_negotiated() determines what the client and
server actually settled on.  These use
nbd_set_request_structured_replies() and friends as a template.

Note that this patch just adds the API for controlling the defaults,
but ignores the state variable; a later one will then tweak the state
machine to actually request structured headers when the state variable
is set, as well as add nbd_opt_extended_headers() for manual control.

The intent is that because extended headers takes priority over
structured replies, the functions will interact as follows during the
automatic handshaking that takes place prior to nbd_opt_set_mode()
relinquishing control to the client in negotiation mode:

1. client default: request_eh=1, request_sr=1
 => try EXTENDED_HEADERS
- server supports it: nothing further to do, use extended headers,
  but also report structured replies as active
- server lacks it: behave like case 2
2. request_eh=0 (explicit client downgrade), request_sr=1 (left at default)
 => try STRUCTURED_REPLY
- server supports it: expect structured replies
- server lacks it: expect simple replies
3. request_sr=0 (explicit client downgrade), request_eh ignored
 => don't try either handshake
- expect simple replies

Client code that wants to manually force simple replies only has to
disable structured replies; and only code that wants to disable
extended headers but still use structured replies has to use the new
nbd_set_request_extended_headers() API.  Until a later patch adds an
explicit API nbd_opt_extended_headers(), there is no way to request
extended headers after structured replies are already negotiated.
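
As a hedged sketch (not part of this patch), case 2 above looks like
this from the client side, with error checking omitted:

  #include <stdio.h>
  #include <libnbd.h>

  int
  main (void)
  {
    struct nbd_handle *nbd = nbd_create ();

    /* Explicit downgrade: no extended headers, but request_sr stays true. */
    nbd_set_request_extended_headers (nbd, false);
    nbd_connect_uri (nbd, "nbd://localhost/");
    printf ("extended: %d, structured: %d\n",
            nbd_get_extended_headers_negotiated (nbd),
            nbd_get_structured_replies_negotiated (nbd));
    nbd_close (nbd);
    return 0;
  }
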
---
 lib/internal.h|   1 +
 generator/API.ml  | 115 --
 generator/states-newstyle-opt-starttls.c  |   1 +
 .../states-newstyle-opt-structured-reply.c|   3 +-
 lib/handle.c  |  23 
 python/t/110-defaults.py  |   1 +
 python/t/120-set-non-defaults.py  |   2 +
 ocaml/tests/test_110_defaults.ml  |   2 +
 ocaml/tests/test_120_set_non_defaults.ml  |   3 +
 golang/libnbd_110_defaults_test.go|   8 ++
 golang/libnbd_120_set_non_defaults_test.go|  12 ++
 11 files changed, 160 insertions(+), 11 deletions(-)

diff --git a/lib/internal.h b/lib/internal.h
index b91fe6f6..1abb21cb 100644
--- a/lib/internal.h
+++ b/lib/internal.h
@@ -110,6 +110,7 @@ struct nbd_handle {
   char *tls_psk_file;   /* PSK filename, NULL = no PSK */

   /* Extended headers. */
+  bool request_eh;  /* Whether to request extended headers */
   bool extended_headers;/* If we negotiated NBD_OPT_EXTENDED_HEADERS */

   /* Desired metadata contexts. */
diff --git a/generator/API.ml b/generator/API.ml
index aeee41fb..3d0289f6 100644
--- a/generator/API.ml
+++ b/generator/API.ml
@@ -808,6 +808,77 @@   "get_tls_psk_file", {
   };
 *)

+  "set_request_extended_headers", {
+default_call with
+args = [Bool "request"]; ret = RErr;
+permitted_states = [ Created ];
+shortdesc = "control use of extended headers";
+longdesc = "\
+By default, libnbd tries to negotiate extended headers with the
+server, as this protocol extension permits the use of 64-bit
+zero, trim, and block status actions.  However,
+for integration testing, it can be useful to clear this flag
+rather than find a way to alter the server to fail the negotiation
+request.
+
+For backwards compatibility, the setting of this knob is ignored
+if L is also set to false,
+since the use of extended headers implies structured replies.";
+see_also = [Link "get_request_extended_headers";
+Link "set_handshake_flags"; Link "set_strict_mode";
+Link "get_extended_headers_negotiated";
+Link "zero"; Link "trim"; Link "cache";
+Link "block_status_64";
+Link "set_request_structured_replies"];
+  };
+
+  "get_request_extended_headers", {
+default_call with
+args = []; ret = RBool;
+may_set_error = false;
+shortdesc = "see if extended headers are attempted";
+longdesc = "\
+Return the state of the request extended headers flag on this
+handle.
+
+B If you want to find out if extended headers were actually
+negotiated on a particular connection use
+L instead.";
+see_also = [Link "set_request_extended_headers";
+Link "get_extended_headers_negotiated";
+Link "get_request_extended_headers"];
+  };
+
+  "get_extended_headers_negotiated", {
+default_call with
+args = []; ret = RBool;
+

[libnbd PATCH v2 17/23] ocaml: Add example for 64-bit extents

2022-11-14 Thread Eric Blake
Since our example program for 32-bit extents is inherently limited to
32-bit lengths, it is also worth demonstrating the 64-bit extent API,
including its saner array indexing.
---
 ocaml/examples/Makefile.am  |  3 ++-
 ocaml/examples/extents64.ml | 42 +
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 ocaml/examples/extents64.ml

diff --git a/ocaml/examples/Makefile.am b/ocaml/examples/Makefile.am
index 5ee6dd63..c6f4989d 100644
--- a/ocaml/examples/Makefile.am
+++ b/ocaml/examples/Makefile.am
@@ -1,5 +1,5 @@
 # nbd client library in userspace
-# Copyright (C) 2013-2019 Red Hat Inc.
+# Copyright (C) 2013-2022 Red Hat Inc.
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -20,6 +20,7 @@ include $(top_srcdir)/subdir-rules.mk
 ml_examples = \
asynch_copy.ml \
extents.ml \
+   extents64.ml \
get_size.ml \
open_qcow2.ml \
server_flags.ml \
diff --git a/ocaml/examples/extents64.ml b/ocaml/examples/extents64.ml
new file mode 100644
index ..8ee7e218
--- /dev/null
+++ b/ocaml/examples/extents64.ml
@@ -0,0 +1,42 @@
+open Printf
+
+let () =
+  NBD.with_handle (
+fun nbd ->
+  NBD.add_meta_context nbd "base:allocation";
+  NBD.connect_command nbd
+  ["nbdkit"; "-s"; "--exit-with-parent"; "-r";
+   "sparse-random"; "8G"];
+
+  (* Read the extents and print them. *)
+  let size = NBD.get_size nbd in
+  let cap =
+match NBD.get_extended_headers_negotiated nbd with
+| true -> size
        | false -> 0x8000_0000_L in
+  let fetch_offset = ref 0_L in
+  while !fetch_offset < size do
+let remaining = Int64.sub size !fetch_offset in
+let fetch_size = min remaining cap in
+NBD.block_status_64 nbd fetch_size !fetch_offset (
+  fun meta _ entries err ->
+printf "nbd_block_status callback: meta=%s err=%d\n" meta !err;
+if meta = "base:allocation" then (
+  printf "index\t%16s %16s %s\n" "offset" "length" "flags";
+  for i = 0 to Array.length entries - 1 do
+let len = fst entries.(i)
+and flags =
+  match snd entries.(i) with
+  | 0_L -> "data"
+  | 1_L -> "hole"
+  | 2_L -> "zero"
+  | 3_L -> "hole+zero"
+  | unknown -> sprintf "unknown (%Ld)" unknown in
+printf "%d:\t%16Ld %16Ld %s\n" i !fetch_offset len flags;
+fetch_offset := Int64.add !fetch_offset len
+  done;
+);
+0
+) (* NBD.block_status *)
+  done
+  )
-- 
2.38.1




[libnbd PATCH v2 13/23] dump: Update nbddump to use 64-bit block status

2022-11-14 Thread Eric Blake
Although our use of "base:allocation" doesn't require the use of the
64-bit API for flags, we might perform slightly faster for a server
that does give us 64-bit extent lengths.
---
 dump/dump.c | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/dump/dump.c b/dump/dump.c
index bdbc9040..0427ab86 100644
--- a/dump/dump.c
+++ b/dump/dump.c
@@ -38,7 +38,7 @@
 #include "version.h"
 #include "vector.h"

-DEFINE_VECTOR_TYPE (uint32_vector, uint32_t)
+DEFINE_VECTOR_TYPE (uint64_vector, uint64_t)

 static const char *progname;
 static struct nbd_handle *nbd;
@@ -262,10 +262,10 @@ catch_signal (int sig)
 static int
 extent_callback (void *user_data, const char *metacontext,
  uint64_t offset,
- uint32_t *entries, size_t nr_entries,
+ nbd_extent *entries, size_t nr_entries,
  int *error)
 {
-  uint32_vector *list = user_data;
+  uint64_vector *list = user_data;
   size_t i;

   if (strcmp (metacontext, LIBNBD_CONTEXT_BASE_ALLOCATION) != 0)
@@ -273,7 +273,8 @@ extent_callback (void *user_data, const char *metacontext,

   /* Just append the entries we got to the list. */
   for (i = 0; i < nr_entries; ++i) {
-if (uint32_vector_append (list, entries[i]) == -1) {
+if (uint64_vector_append (list, entries[i].length) == -1 ||
+uint64_vector_append (list, entries[i].flags) == -1) {
   perror ("realloc");
   exit (EXIT_FAILURE);
 }
@@ -284,7 +285,7 @@ extent_callback (void *user_data, const char *metacontext,
 static bool
 test_all_zeroes (uint64_t offset, size_t count)
 {
-  uint32_vector entries = empty_vector;
+  uint64_vector entries = empty_vector;
   size_t i;
   uint64_t count_read;

@@ -296,22 +297,22 @@ test_all_zeroes (uint64_t offset, size_t count)
* false, causing the main code to do a full read.  We could be
* smarter and keep asking the server (XXX).
*/
-  if (nbd_block_status (nbd, count, offset,
-(nbd_extent_callback) {
-  .callback = extent_callback,
-  .user_data = &entries },
-0) == -1) {
+  if (nbd_block_status_64 (nbd, count, offset,
+   (nbd_extent64_callback) {
+ .callback = extent_callback,
+ .user_data = &entries },
+   0) == -1) {
 fprintf (stderr, "%s: %s\n", progname, nbd_get_error ());
 exit (EXIT_FAILURE);
   }

   count_read = 0;
   for (i = 0; i < entries.len; i += 2) {
-uint32_t len = entries.ptr[i];
-uint32_t type = entries.ptr[i+1];
+uint64_t len = entries.ptr[i];
+uint64_t type = entries.ptr[i+1];

 count_read += len;
-if (!(type & 2))/* not zero */
+if (!(type & LIBNBD_STATE_ZERO))/* not zero */
   return false;
   }

-- 
2.38.1




Re: [PULL 00/11] Block layer patches

2022-11-14 Thread Hanna Reitz

On 11.11.22 20:20, Stefan Hajnoczi wrote:

On Fri, 11 Nov 2022 at 10:29, Kevin Wolf  wrote:

The following changes since commit 2ccad61746ca7de5dd3e25146062264387e43bd4:

   Merge tag 'pull-tcg-20221109' of https://gitlab.com/rth7680/qemu into 
staging (2022-11-09 13:26:45 -0500)

are available in the Git repository at:

   https://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to b04af371af685c12970ea93027dc6d8bf86265aa:

   tests/stream-under-throttle: New test (2022-11-11 13:02:43 +0100)


Block layer patches

- Fix deadlock in graph modification with iothreads
- mirror: Fix non-converging cases for active mirror
- qapi: Fix BlockdevOptionsNvmeIoUring @path description
- blkio: Set BlockDriver::has_variable_length to false


Alberto Faria (2):
   qapi/block-core: Fix BlockdevOptionsNvmeIoUring @path description
   block/blkio: Set BlockDriver::has_variable_length to false

Hanna Reitz (9):
   block/mirror: Do not wait for active writes
   block/mirror: Drop mirror_wait_for_any_operation()
   block/mirror: Fix NULL s->job in active writes
   iotests/151: Test that active mirror progresses
   iotests/151: Test active requests on mirror start
   block: Make bdrv_child_get_parent_aio_context I/O
   block-backend: Update ctx immediately after root
   block: Start/end drain on correct AioContext
   tests/stream-under-throttle: New test

Hi Hanna,
This test is broken, probably due to the minimum Python version:
https://gitlab.com/qemu-project/qemu/-/jobs/3311521303


:(

I just took the exception name (asyncio.exceptions.TimeoutError) from
the test output when a timeout occurred; it seems that is indeed too
new.  I'm not entirely sure when it was introduced, and what its
relationship to asyncio.TimeoutError is – in 3.11, the latter is an
alias for the former, but I have 3.10 myself, where the documentation
says both are distinct.  Anyway, using either works fine here, and the
existing scripts in python/qemu use asyncio.TimeoutError, so I've sent a
v2 of the patch to do the same.


(For the record, this test isn’t vital for anything, so just dropping it 
from the pull request seems perfectly fine to me.)


Hanna




Re: [PATCH v2 1/5] migration: Fix possible infinite loop of ram save process

2022-11-14 Thread Juan Quintela
Peter Xu  wrote:
> When starting ram saving procedure (especially at the completion phase),
> always set last_seen_block to non-NULL to make sure we can always correctly
> detect the case where "we've migrated all the dirty pages".
>
> Then we'll guarantee both last_seen_block and pss.block will be valid
> always before the loop starts.
>
> See the comment in the code for some details.
>
> Reviewed-by: Dr. David Alan Gilbert 
> Signed-off-by: Peter Xu 

Reviewed-by: Juan Quintela 

queued




KVM call for 2022-11-15

2022-11-14 Thread quintela


Hi

Please, send any topic that you are interested in covering.

We already have some topics:
Re the agenda, see below for the topics our team would like to discuss:

   - QEMU support for kernel/vfio V2 live migration patches
   - acceptance of changes required for Grace/Hopper passthrough and vGPU
     support
     - the migration support is now looking like it will converge on the
       6.2 kernel
   - tuning GPU migration performance on QEMU/vfio, beyond what the V2 work
     delivers


 Call details:

By popular demand, here is a public Google calendar entry for it:

  
https://www.google.com/calendar/embed?src=dG9iMXRqcXAzN3Y4ZXZwNzRoMHE4a3BqcXNAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ

(Let me know if you have any problems with the calendar entry.  I just
gave up on getting the time right simultaneously for CEST, CET, EDT and DST).

If you need phone number details,  contact me privately

Thanks, Juan.




Re: [PATCH 08/13] stream: Replace subtree drain with a single node drain

2022-11-14 Thread Hanna Reitz

On 08.11.22 13:37, Kevin Wolf wrote:

The subtree drain was introduced in commit b1e1af394d9 as a way to avoid
graph changes between finding the base node and changing the block graph
as necessary on completion of the image streaming job.

The block graph could change between these two points because
bdrv_set_backing_hd() first drains the parent node, which involved
polling and can do anything.

Subtree draining was an imperfect way to make this less likely (because
with it, fewer callbacks are called during this window). Everyone agreed
that it's not really the right solution, and it was only committed as a
stopgap solution.

This replaces the subtree drain with a solution that simply drains the
parent node before we try to find the base node, and then call a version
of bdrv_set_backing_hd() that doesn't drain, but just asserts that the
parent node is already drained.

This way, any graph changes caused by draining happen before we start
looking at the graph and things stay consistent between finding the base
node and changing the graph.

Signed-off-by: Kevin Wolf 
---
  include/block/block-global-state.h |  3 +++
  block.c| 17 ++---
  block/stream.c | 20 ++--
  3 files changed, 27 insertions(+), 13 deletions(-)
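
For readers skimming the thread, the pattern the commit message
describes is roughly the following (a simplified sketch; the name of
the non-draining variant follows the commit message and may differ in
detail from the actual patch):

  bdrv_drained_begin (unfiltered_bs);

  /* The graph cannot change under us now, so finding the base node and
   * attaching it are consistent with each other. */
  base = bdrv_filter_or_cow_bs (s->above_base);
  bdrv_set_backing_hd_drained (unfiltered_bs, base, &local_err);

  bdrv_drained_end (unfiltered_bs);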


Reviewed-by: Hanna Reitz 




Re: [PATCH v4] qapi/qom: Memory backend property prealloc-threads doc fix

2022-11-14 Thread Markus Armbruster
Zhenyu Zhang  writes:

> Commit ffac16fab3 "hostmem: introduce "prealloc-threads" property"
> (v5.0.0) changed the default number of threads from number of CPUs
> to 1.  This was deemed a regression, and fixed in commit f8d426a685
> "hostmem: default the amount of prealloc-threads to smp-cpus".
> Except the documentation remained unchanged.
>
> Update 'qapi/qom.json' to reflect the change in commit f8d426a6852c.
>
> Signed-off-by: Zhenyu Zhang 
> ---
>
> v4: Fix the line exceeding 80 characters limitation issue  (Gavin)
> v3: Covers historical descriptions (Markus)
> v2: The property is changed to smp-cpus since 5.0  (Phild)
>
> ---
>  qapi/qom.json | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/qapi/qom.json b/qapi/qom.json
> index 30e76653ad..f4a7917f3d 100644
> --- a/qapi/qom.json
> +++ b/qapi/qom.json
> @@ -576,7 +576,8 @@
>  #
>  # @prealloc: if true, preallocate memory (default: false)
>  #
> -# @prealloc-threads: number of CPU threads to use for prealloc (default: 1)
> +# @prealloc-threads: number of CPU threads to use for prealloc 

Could drop "CPU" while there.  Not worth a respin by itself, but perhaps
the maintainer can drop it for you.

> +#(default: number of CPUs) (since 5.0)
>  #
>  # @prealloc-context: thread context to use for creation of preallocation 
> threads
>  #(default: none) (since 7.2)

Reviewed-by: Markus Armbruster 




[libnbd PATCH v2 10/23] api: Add [aio_]nbd_block_status_64

2022-11-14 Thread Eric Blake
Overcome the inherent 32-bit limitation of our existing
nbd_block_status command by adding a 64-bit variant.  The command sent
to the server does not change, but the user's callback is now handed
64-bit information regardless of whether the server replies with 32-
or 64-bit extents.

Unit tests prove that the new API works in each of C, Python, OCaml,
and Go bindings.  We can also get rid of the temporary hack added to
appease the compiler in an earlier patch.
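
In C, usage of the new call looks roughly like this (a sketch; the
callback signature matches this patch's diffs, error handling trimmed):

  #include <stdio.h>
  #include <inttypes.h>
  #include <libnbd.h>

  static int
  extent64_cb (void *user_data, const char *metacontext, uint64_t offset,
               nbd_extent *entries, size_t nr_entries, int *error)
  {
    size_t i;

    /* One struct per extent - no more len/2 pair arithmetic. */
    for (i = 0; i < nr_entries; i++)
      printf ("%" PRIu64 " bytes, flags 0x%" PRIx64 "\n",
              entries[i].length, entries[i].flags);
    return 0;
  }

  /* ... after nbd_add_meta_context (nbd, "base:allocation") and
   * connecting: */
  if (nbd_block_status_64 (nbd, length, 0,
                           (nbd_extent64_callback) { .callback = extent64_cb },
                           0) == -1)
    fprintf (stderr, "%s\n", nbd_get_error ());
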
---
 docs/libnbd.pod   |  18 +--
 sh/nbdsh.pod  |   2 +-
 generator/API.ml  | 144 +++---
 generator/OCaml.ml|   1 -
 generator/Python.ml   |   1 -
 lib/rw.c  |  48 ++--
 python/t/465-block-status-64.py   |  56 +
 ocaml/tests/Makefile.am   |   1 +
 ocaml/tests/test_465_block_status_64.ml   |  58 +
 tests/meta-base-allocation.c  | 111 +++--
 fuzzing/libnbd-fuzz-wrapper.c |  22 +++-
 golang/Makefile.am|   1 +
 golang/libnbd_465_block_status_64_test.go | 119 ++
 13 files changed, 534 insertions(+), 48 deletions(-)
 create mode 100644 python/t/465-block-status-64.py
 create mode 100644 ocaml/tests/test_465_block_status_64.ml
 create mode 100644 golang/libnbd_465_block_status_64_test.go

diff --git a/docs/libnbd.pod b/docs/libnbd.pod
index 170fc1fa..b57cfa46 100644
--- a/docs/libnbd.pod
+++ b/docs/libnbd.pod
@@ -661,14 +661,14 @@ In order to utilize block status, the client must call
 L prior to connecting, for each meta context
 in which it is interested, then check L after
 connection to see which contexts the server actually supports.  If a
-context is supported, the client can then use L
-with a callback function that will receive an array of 32-bit integer
-pairs describing consecutive extents within a context.  In each pair,
-the first integer is the length of the extent, the second is a bitmask
-description of that extent (for the "base:allocation" context, the
-bitmask may include C for unallocated portions of
-the file, and/or C for portions of the file known
-to read as zero).
+context is supported, the client can then use
+L with a callback function that will receive
+an array of structs describing consecutive extents within a context.
+Each struct gives the length of the extent, then a bitmask description
+of that extent (for the "base:allocation" context, the bitmask may
+include C for unallocated portions of the file,
+and/or C for portions of the file known to read as
+zero).

 There is a full example of requesting meta context and using block
 status available at
@@ -917,7 +917,7 @@ will tie up resources until L is eventually 
reached).
 =head2 Callbacks with C parameter

 Some of the high-level commands (L,
-L) involve the use of a callback function invoked
+L) involve the use of a callback function invoked
 by the state machine at appropriate points in the server's reply
 before the overall command is complete.  These callback functions,
 along with all of the completion callbacks, include a parameter
diff --git a/sh/nbdsh.pod b/sh/nbdsh.pod
index 4d14118c..666a31b6 100644
--- a/sh/nbdsh.pod
+++ b/sh/nbdsh.pod
@@ -59,7 +59,7 @@ Display brief command line help and exit.
 =item B<--base-allocation>

 Request the use of the "base:allocation" meta context, which is the
-most common context used with L.  This is
+most common context used with L.  This is
 equivalent to calling
 S> in the shell
 prior to connecting, and works even when combined with C<--uri> (while
diff --git a/generator/API.ml b/generator/API.ml
index 85509867..aeee41fb 100644
--- a/generator/API.ml
+++ b/generator/API.ml
@@ -1565,7 +1565,7 @@   "add_meta_context", {
 During connection libnbd can negotiate zero or more metadata
 contexts with the server.  Metadata contexts are features (such
 as C<\"base:allocation\">) which describe information returned
-by the L command (for C<\"base:allocation\">
+by the L command (for C<\"base:allocation\">
 this is whether blocks of data are allocated, zero or sparse).

 This call adds one metadata context to the list to be negotiated.
@@ -1592,7 +1592,7 @@   "add_meta_context", {
 Other metadata contexts are server-specific, but include
 C<\"qemu:dirty-bitmap:...\"> and C<\"qemu:allocation-depth\"> for
 qemu-nbd (see qemu-nbd I<-B> and I<-A> options).";
-see_also = [Link "block_status"; Link "can_meta_context";
+see_also = [Link "block_status_64"; Link "can_meta_context";
 Link "get_nr_meta_contexts"; Link "get_meta_context";
 Link "clear_meta_contexts"];
   };
@@ -1605,14 +1605,14 @@   "get_nr_meta_contexts", {
 During connection libnbd can negotiate zero or more metadata
 contexts with the server.  Metadata contexts are features (such
 as C<\"base:allocation\">) which describe information returned
-by the L comman

[libnbd PATCH v2 04/23] states: Prepare to send 64-bit requests

2022-11-14 Thread Eric Blake
Support sending 64-bit requests if extended headers were negotiated.
This includes setting NBD_CMD_FLAG_PAYLOAD_LEN any time we send an
extended NBD_CMD_WRITE; this is such a fundamental part of the
protocol that for now it is easier to silently ignore whatever value
the user passes in for that bit in the flags parameter of nbd_pwrite
regardless of the current settings in set_strict_mode, rather than
trying to force the user to pass in the correct value to match whether
extended mode is negotiated.  However, when we later add APIs to give
the user more control for interoperability testing, it may be worth
adding a new set_strict_mode control knob to explicitly enable the
client to intentionally violate the protocol (the testsuite added in
this patch would then be updated to match).

At this point, h->extended_headers is permanently false (we can't
enable it until all other aspects of the protocol have likewise been
converted).

Support for using FLAG_PAYLOAD_LEN with NBD_CMD_BLOCK_STATUS is less
fundamental, and deserves to be in its own patch.
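
Concretely, the issue path ends up choosing an encoding per
negotiation, roughly as follows (a simplified sketch of
generator/states-issue-command.c after this patch; the extended magic
constant's name is my assumption from the protocol extension):

  if (h->extended_headers) {
    h->req.extended.magic  = htobe32 (NBD_EXTENDED_REQUEST_MAGIC); /* assumed */
    h->req.extended.flags  = htobe16 (cmd->flags);
    h->req.extended.type   = htobe16 (cmd->type);
    h->req.extended.cookie = htobe64 (cmd->cookie);
    h->req.extended.offset = htobe64 (cmd->offset);
    h->req.extended.count  = htobe64 (cmd->count);   /* full 64 bits */
  }
  else {
    assert (cmd->count <= UINT32_MAX);               /* compact limit */
    h->req.compact.magic  = htobe32 (NBD_REQUEST_MAGIC);
    h->req.compact.flags  = htobe16 (cmd->flags);
    h->req.compact.type   = htobe16 (cmd->type);
    h->req.compact.handle = htobe64 (cmd->cookie);
    h->req.compact.offset = htobe64 (cmd->offset);
    h->req.compact.count  = htobe32 (cmd->count);
  }
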
---
 lib/internal.h  |  10 ++-
 generator/API.ml|  20 +++--
 generator/states-issue-command.c|  29 ---
 generator/states-reply-structured.c |   2 +-
 lib/rw.c|  17 +++--
 tests/Makefile.am   |   4 +
 tests/pwrite-extended.c | 112 
 .gitignore  |   1 +
 8 files changed, 169 insertions(+), 26 deletions(-)
 create mode 100644 tests/pwrite-extended.c

diff --git a/lib/internal.h b/lib/internal.h
index f81c41ba..e900eca3 100644
--- a/lib/internal.h
+++ b/lib/internal.h
@@ -109,6 +109,9 @@ struct nbd_handle {
   char *tls_username;   /* Username, NULL = use current username */
   char *tls_psk_file;   /* PSK filename, NULL = no PSK */

+  /* Extended headers. */
+  bool extended_headers;/* If we negotiated NBD_OPT_EXTENDED_HEADERS */
+
   /* Desired metadata contexts. */
   bool request_sr;
   bool request_meta;
@@ -262,7 +265,10 @@ struct nbd_handle {
   /* Issuing a command must use a buffer separate from sbuf, for the
* case when we interrupt a request to service a reply.
*/
-  struct nbd_request request;
+  union {
+struct nbd_request compact;
+struct nbd_request_ext extended;
+  } req;
   bool in_write_payload;
   bool in_write_shutdown;

@@ -363,7 +369,7 @@ struct command {
   uint16_t type;
   uint64_t cookie;
   uint64_t offset;
-  uint32_t count;
+  uint64_t count;
   void *data; /* Buffer for read/write */
   struct command_cb cb;
   bool initialized; /* For read, true if getting a hole may skip memset */
diff --git a/generator/API.ml b/generator/API.ml
index 25a612a2..beb7a2b4 100644
--- a/generator/API.ml
+++ b/generator/API.ml
@@ -198,11 +198,12 @@ let cmd_flags =
   flag_prefix = "CMD_FLAG";
   guard = Some "((h->strict & LIBNBD_STRICT_FLAGS) || flags > UINT16_MAX)";
   flags = [
-"FUA",   1 lsl 0;
-"NO_HOLE",   1 lsl 1;
-"DF",1 lsl 2;
-"REQ_ONE",   1 lsl 3;
-"FAST_ZERO", 1 lsl 4;
+"FUA", 1 lsl 0;
+"NO_HOLE", 1 lsl 1;
+"DF",  1 lsl 2;
+"REQ_ONE", 1 lsl 3;
+"FAST_ZERO",   1 lsl 4;
+"PAYLOAD_LEN", 1 lsl 5;
   ]
 }
 let handshake_flags = {
@@ -2459,7 +2460,7 @@   "pread_structured", {
   "pwrite", {
 default_call with
 args = [ BytesIn ("buf", "count"); UInt64 "offset" ];
-optargs = [ OFlags ("flags", cmd_flags, Some ["FUA"]) ];
+optargs = [ OFlags ("flags", cmd_flags, Some ["FUA"; "PAYLOAD_LEN"]) ];
 ret = RErr;
 permitted_states = [ Connected ];
 shortdesc = "write to the NBD server";
@@ -2482,7 +2483,10 @@   "pwrite", {
 C meaning that the server should not
 return until the data has been committed to permanent storage
 (if that is supported - some servers cannot do this, see
-L)."
+L).  For convenience, libnbd ignores the presence
+or absence of the flag C in C,
+while correctly using the flag over the wire according to whether
+extended headers were negotiated."
 ^ strict_call_description;
 see_also = [Link "can_fua"; Link "is_read_only";
 Link "aio_pwrite"; Link "get_block_size";
@@ -3172,7 +3176,7 @@   "aio_pwrite", {
 default_call with
 args = [ BytesPersistIn ("buf", "count"); UInt64 "offset" ];
 optargs = [ OClosure completion_closure;
-OFlags ("flags", cmd_flags, Some ["FUA"]) ];
+OFlags ("flags", cmd_flags, Some ["FUA"; "PAYLOAD_LEN"]) ];
 ret = RCookie;
 permitted_states = [ Connected ];
 shortdesc = "write to the NBD server";
diff --git a/generator/states-issue-command.c b/generator/states-issue-command.c
index df9295b5..feea2672 100644
--- a/generator/states-issue-command.c
+++ b/generator/states-issue-command.c
@@ -41,15 +41,24 @@  ISSUE_COMMAND.START:
 return 0;
   }

-  h->request.magic = htobe32 (NBD_REQUEST_MAGIC);
-  h->request.flags = htobe16 (cmd->flags);
-  h->

Re: [PATCH v9 2/8] KVM: Extend the memslot to support fd-based private memory

2022-11-14 Thread Alex Bennée


Chao Peng  writes:

> In memory encryption usage, guest memory may be encrypted with a special
> key and can be accessed only by the guest itself. We call such memory
> private memory. It's of no value, and can sometimes cause problems, to
> allow userspace to access guest private memory. This new KVM memslot
> extension allows guest private memory to be provided through a
> restrictedmem-backed file descriptor (fd), and userspace is restricted
> from accessing the bookmarked memory in the fd.
>

> To make code maintenance easy, internally we use a binary compatible
> alias struct kvm_user_mem_region to handle both the normal and the
> '_ext' variants.

> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 0d5d4419139a..f1ae45c10c94 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -103,6 +103,33 @@ struct kvm_userspace_memory_region {
>   __u64 userspace_addr; /* start of the userspace allocated memory */
>  };
>  
> +struct kvm_userspace_memory_region_ext {
> + struct kvm_userspace_memory_region region;
> + __u64 restricted_offset;
> + __u32 restricted_fd;
> + __u32 pad1;
> + __u64 pad2[14];
> +};
> +
> +#ifdef __KERNEL__
> +/*
> + * kvm_user_mem_region is a kernel-only alias of 
> kvm_userspace_memory_region_ext
> + * that "unpacks" kvm_userspace_memory_region so that KVM can directly access
> + * all fields from the top-level "extended" region.
> + */
> +struct kvm_user_mem_region {
> + __u32 slot;
> + __u32 flags;
> + __u64 guest_phys_addr;
> + __u64 memory_size;
> + __u64 userspace_addr;
> + __u64 restricted_offset;
> + __u32 restricted_fd;
> + __u32 pad1;
> + __u64 pad2[14];
> +};
> +#endif

I'm not sure I buy the argument this makes the code maintenance easier
because you now have multiple places to update if you extend the field.
Was this simply to avoid changing:

  foo->slot to foo->region.slot

in the underlying code?
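
For what it's worth, my reading of the intended userspace flow is
something like the following (a hypothetical snippet; only the struct
layout and KVM_MEM_PRIVATE come from the patch, the ioctl plumbing is
my assumption):

  struct kvm_userspace_memory_region_ext mem = {
      .region = {
          .slot            = 0,
          .flags           = KVM_MEM_PRIVATE,
          .guest_phys_addr = 0x0,
          .memory_size     = size,
          .userspace_addr  = (__u64)shared_va, /* shared (mmapable) half */
      },
      .restricted_fd     = restricted_fd,      /* from memfd_restricted() */
      .restricted_offset = 0,                  /* private half lives here */
  };

  ioctl (vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);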

> +
>  /*
>   * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace,
>   * other bits are reserved for kvm internal use which are defined in
> @@ -110,6 +137,7 @@ struct kvm_userspace_memory_region {
>   */
>  #define KVM_MEM_LOG_DIRTY_PAGES  (1UL << 0)
>  #define KVM_MEM_READONLY (1UL << 1)
> +#define KVM_MEM_PRIVATE  (1UL << 2)
>  
>  /* for KVM_IRQ_LINE */
>  struct kvm_irq_level {
> @@ -1178,6 +1206,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_S390_ZPCI_OP 221
>  #define KVM_CAP_S390_CPU_TOPOLOGY 222
>  #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
> +#define KVM_CAP_PRIVATE_MEM 224
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 800f9470e36b..9ff164c7e0cc 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -86,3 +86,6 @@ config KVM_XFER_TO_GUEST_WORK
>  
>  config HAVE_KVM_PM_NOTIFIER
> bool
> +
> +config HAVE_KVM_RESTRICTED_MEM
> +   bool
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index e30f1b4ecfa5..8dace78a0278 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1526,7 +1526,7 @@ static void kvm_replace_memslot(struct kvm *kvm,
>   }
>  }
>  
> -static int check_memory_region_flags(const struct 
> kvm_userspace_memory_region *mem)
> +static int check_memory_region_flags(const struct kvm_user_mem_region *mem)
>  {
>   u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
>  
> @@ -1920,7 +1920,7 @@ static bool kvm_check_memslot_overlap(struct 
> kvm_memslots *slots, int id,
>   * Must be called holding kvm->slots_lock for write.
>   */
>  int __kvm_set_memory_region(struct kvm *kvm,
> - const struct kvm_userspace_memory_region *mem)
> + const struct kvm_user_mem_region *mem)
>  {
>   struct kvm_memory_slot *old, *new;
>   struct kvm_memslots *slots;
> @@ -2024,7 +2024,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
>  
>  int kvm_set_memory_region(struct kvm *kvm,
> -   const struct kvm_userspace_memory_region *mem)
> +   const struct kvm_user_mem_region *mem)
>  {
>   int r;
>  
> @@ -2036,7 +2036,7 @@ int kvm_set_memory_region(struct kvm *kvm,
>  EXPORT_SYMBOL_GPL(kvm_set_memory_region);
>  
>  static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
> -   struct kvm_userspace_memory_region 
> *mem)
> +   struct kvm_user_mem_region *mem)
>  {
>   if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
>   return -EINVAL;
> @@ -4627,6 +4627,33 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
>   return fd;
>  }
>  
> +#define SANITY_CHECK_MEM_REGION_FIELD(field) 
> \
> +do { 
> \
> + BUILD_BUG_ON(offsetof(struct kvm_user_mem_region, field) != 
> \
> +  offsetof(struct 

Re: [PATCH v6 09/10] vdpa: Add listener_shadow_vq to vhost_vdpa

2022-11-14 Thread Eugenio Perez Martin
On Mon, Nov 14, 2022 at 5:30 AM Jason Wang  wrote:
>
>
> On 2022/11/11 21:12, Eugenio Perez Martin wrote:
> > On Fri, Nov 11, 2022 at 8:49 AM Jason Wang  wrote:
> >>
> >> On 2022/11/10 21:47, Eugenio Perez Martin wrote:
> >>> On Thu, Nov 10, 2022 at 7:01 AM Jason Wang  wrote:
>  On Wed, Nov 9, 2022 at 1:08 AM Eugenio Pérez  wrote:
> > The memory listener that tells the device how to convert GPA to qemu's
> > va is registered against CVQ vhost_vdpa. This series tries to map the
> > memory listener translations to ASID 0, while it maps the CVQ ones to
> > ASID 1.
> >
> > Let's tell the listener if it needs to register them on iova tree or
> > not.
> >
> > Signed-off-by: Eugenio Pérez 
> > ---
> > v5: Solve conflict about vhost_iova_tree_remove accepting mem_region by
> >   value.
> > ---
> >include/hw/virtio/vhost-vdpa.h | 2 ++
> >hw/virtio/vhost-vdpa.c | 6 +++---
> >net/vhost-vdpa.c   | 1 +
> >3 files changed, 6 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/hw/virtio/vhost-vdpa.h 
> > b/include/hw/virtio/vhost-vdpa.h
> > index 6560bb9d78..0c3ed2d69b 100644
> > --- a/include/hw/virtio/vhost-vdpa.h
> > +++ b/include/hw/virtio/vhost-vdpa.h
> > @@ -34,6 +34,8 @@ typedef struct vhost_vdpa {
> >struct vhost_vdpa_iova_range iova_range;
> >uint64_t acked_features;
> >bool shadow_vqs_enabled;
> > +/* The listener must send iova tree addresses, not GPA */
> >>
> >> Btw, cindy's vIOMMU series will make it not necessarily GPA any more.
> >>
> > Yes, this comment should be tuned then. But the SVQ iova_tree will not
> > be equal to the vIOMMU one because of the shadow vrings.
> >
> > But maybe SVQ can inspect both instead of having all the duplicated entries.
> >
> > +bool listener_shadow_vq;
> >/* IOVA mapping used by the Shadow Virtqueue */
> >VhostIOVATree *iova_tree;
> >GPtrArray *shadow_vqs;
> > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > index 8fd32ba32b..e3914fa40e 100644
> > --- a/hw/virtio/vhost-vdpa.c
> > +++ b/hw/virtio/vhost-vdpa.c
> > @@ -220,7 +220,7 @@ static void 
> > vhost_vdpa_listener_region_add(MemoryListener *listener,
> > vaddr, section->readonly);
> >
> >llsize = int128_sub(llend, int128_make64(iova));
> > -if (v->shadow_vqs_enabled) {
> > +if (v->listener_shadow_vq) {
> >int r;
> >
> >mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr,
> > @@ -247,7 +247,7 @@ static void 
> > vhost_vdpa_listener_region_add(MemoryListener *listener,
> >return;
> >
> >fail_map:
> > -if (v->shadow_vqs_enabled) {
> > +if (v->listener_shadow_vq) {
> >vhost_iova_tree_remove(v->iova_tree, mem_region);
> >}
> >
> > @@ -292,7 +292,7 @@ static void 
> > vhost_vdpa_listener_region_del(MemoryListener *listener,
> >
> >llsize = int128_sub(llend, int128_make64(iova));
> >
> > -if (v->shadow_vqs_enabled) {
> > +if (v->listener_shadow_vq) {
> >const DMAMap *result;
> >const void *vaddr = memory_region_get_ram_ptr(section->mr) +
> >section->offset_within_region +
> > diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> > index 85a318faca..02780ee37b 100644
> > --- a/net/vhost-vdpa.c
> > +++ b/net/vhost-vdpa.c
> > @@ -570,6 +570,7 @@ static NetClientState 
> > *net_vhost_vdpa_init(NetClientState *peer,
> >s->vhost_vdpa.index = queue_pair_index;
> >s->always_svq = svq;
> >s->vhost_vdpa.shadow_vqs_enabled = svq;
> > +s->vhost_vdpa.listener_shadow_vq = svq;
>  Any chance those above two can differ?
> 
> >>> If CVQ is shadowed but data VQs are not, shadow_vqs_enabled is true
> >>> but listener_shadow_vq is not.
> >>>
> >>> It is more clear in the next commit, where only shadow_vqs_enabled is
> >>> set to true at vhost_vdpa_net_cvq_start.
> >>
> >> Ok, the name looks a little bit confusing. I wonder if it's better to
> >> use shadow_cvq and shadow_data ?
> >>
> > I'm ok with renaming it, but struct vhost_vdpa is generic across all
> > kind of devices, and it does not know if it is a datapath or not for
> > the moment.
> >
> > Maybe listener_uses_iova_tree?
>
>
> I think "iova_tree" is something that is internal to svq implementation,
> it's better to define the name from the view of vhost_vdpa level.
>

I don't get this, vhost_vdpa struct already has a pointer to its iova_tree.

Thanks!




[libnbd PATCH v2 00/23] libnbd 64-bit NBD extensions

2022-11-14 Thread Eric Blake
This series is posted alongside a spec change to NBD, and
interoperable with changes posted to qemu-nbd/qemu-storage-daemon.
The RFC patch at the end is optional; interoperability with qemu works
only when either both projects omit the RFC patch, or when both
projects include it (if only one of the two projects includes it,
the protocol is incompatible between the two, but at least client and
server gracefully detect the bug rather than SEGV'ing).

Eric Blake (23):
  block_status: Refactor array storage
  internal: Refactor layout of replies in sbuf
  protocol: Add definitions for extended headers
  states: Prepare to send 64-bit requests
  states: Prepare to receive 64-bit replies
  states: Break deadlock if server goofs on extended replies
  generator: Add struct nbd_extent in prep for 64-bit extents
  block_status: Track 64-bit extents internally
  block_status: Accept 64-bit extents during block status
  api: Add [aio_]nbd_block_status_64
  api: Add several functions for controlling extended headers
  copy: Update nbdcopy to use 64-bit block status
  dump: Update nbddump to use 64-bit block status
  info: Expose extended-headers support through nbdinfo
  info: Update nbdinfo --map to use 64-bit block status
  examples: Update copy-libev to use 64-bit block status
  ocaml: Add example for 64-bit extents
  generator: Actually request extended headers
  api: Add nbd_[aio_]opt_extended_headers()
  interop: Add test of 64-bit block status
  api: Add nbd_can_block_status_payload()
  api: Add nbd_[aio_]block_status_filter()
  RFC: pread: Accept 64-bit holes

 docs/libnbd.pod   |  18 +-
 info/nbdinfo.pod  |  21 +-
 sh/nbdsh.pod  |   2 +-
 lib/internal.h|  42 +-
 lib/nbd-protocol.h| 120 ++--
 generator/API.ml  | 532 +++---
 generator/API.mli |   1 +
 generator/C.ml|  24 +-
 generator/GoLang.ml   |  24 +
 generator/Makefile.am |   3 +-
 generator/OCaml.ml|  18 +-
 generator/Python.ml   |  20 +-
 generator/state_machine.ml|  50 +-
 generator/states-issue-command.c  |  33 +-
 .../states-newstyle-opt-extended-headers.c| 110 
 generator/states-newstyle-opt-starttls.c  |   7 +-
 .../states-newstyle-opt-structured-reply.c|   3 +-
 generator/states-newstyle.c   |   3 +
 generator/states-reply-simple.c   |   4 +-
 generator/states-reply-structured.c   | 279 ++---
 generator/states-reply.c  |  57 +-
 lib/aio.c |   7 +-
 lib/flags.c   |  11 +
 lib/handle.c  |  25 +-
 lib/opt.c |  44 ++
 lib/rw.c  | 250 +++-
 python/t/110-defaults.py  |   1 +
 python/t/120-set-non-defaults.py  |   2 +
 python/t/465-block-status-64.py   |  56 ++
 ocaml/examples/Makefile.am|   3 +-
 ocaml/examples/extents64.ml   |  42 ++
 ocaml/helpers.c   |  20 +
 ocaml/nbd-c.h |   3 +-
 ocaml/tests/Makefile.am   |   1 +
 ocaml/tests/test_110_defaults.ml  |   2 +
 ocaml/tests/test_120_set_non_defaults.ml  |   3 +
 ocaml/tests/test_465_block_status_64.ml   |  58 ++
 tests/Makefile.am |   4 +
 tests/meta-base-allocation.c  | 111 +++-
 tests/pwrite-extended.c   | 112 
 examples/copy-libev.c |  21 +-
 examples/server-flags.c   |   7 +-
 interop/Makefile.am   |  18 +
 interop/block-status-payload.c| 241 
 interop/block-status-payload.sh   |  80 +++
 interop/large-status.c| 186 ++
 interop/large-status.sh   |  49 ++
 interop/opt-extended-headers.c| 153 +
 interop/opt-extended-headers.sh   |  29 +
 .gitignore|   4 +
 copy/nbd-ops.c|  22 +-
 dump/dump.c   |  27 +-
 fuzzing/libnbd-fuzz-wrapper.c |  22 +-
 golang/Makefile.am|   1 +
 golang/handle.go  |   8 +-
 golang/libnbd_110_defaults_test.go|   8 +
 golang/libnbd_120_set_non_defaults_test.go|  12 +
 golang/libnbd_465_block_status_64_test.go | 119 
 info/can.c|  14 +
 info/info-can.sh  |  30 +
 info/info-packets.sh

Re: [PATCH v2 2/5] migration: Fix race on qemu_file_shutdown()

2022-11-14 Thread Juan Quintela
Peter Xu  wrote:
> In qemu_file_shutdown(), there's a possible race if with current order of
> operation.  There're two major things to do:
>
>   (1) Do real shutdown() (e.g. shutdown() syscall on socket)
>   (2) Update qemufile's last_error
>
> We must do (2) before (1) otherwise there can be a race condition like:
>
>   page receiver                          other thread
>   -------------                          ------------
>   qemu_get_buffer()
>                                          do shutdown()
>   returns 0 (buffer all zero)
>   (meanwhile we didn't check this retcode)
>   try to detect IO error
>     last_error==NULL, IO okay
>   install ALL-ZERO page
>                                          set last_error
>   --> guest crash!
>
> To fix this, we can also check retval of qemu_get_buffer(), but not all
> APIs can be properly checked and ultimately we still need to go back to
> qemu_file_get_error().  E.g. qemu_get_byte() doesn't return error.
>
> Maybe some day a rework of qemufile API is really needed, but for now keep
> using qemu_file_get_error() and fix it by not allowing that race condition
> to happen.  Here shutdown() is indeed special because the last_error was
> emulated.  For real -EIO errors it'll always be set when e.g. sendmsg()
> error triggers so we won't miss those ones, only shutdown() is a bit tricky
> here.
>
> Cc: Daniel P. Berrange 
> Reviewed-by: Dr. David Alan Gilbert 
> Signed-off-by: Peter Xu 

Reviewed-by: Juan Quintela 
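
For readers following along, the fix amounts to swapping the order of the
two steps; a simplified sketch of the idea (not the exact qemu-file code:
the function name and the use of f->ioc here are illustrative):

    static int qemu_file_shutdown_sketch(QEMUFile *f)
    {
        /*
         * Mark the error *before* the actual shutdown(), so a reader
         * that sees a zero-filled buffer from the torn-down channel
         * already observes last_error via qemu_file_get_error().
         */
        qemu_file_set_error(f, -EIO);
        return qio_channel_shutdown(f->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
                                    NULL);
    }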




Re: [PATCH v6 2/2] Unit test code and benchmark code

2022-11-14 Thread Juan Quintela
ling xu  wrote:
> Unit test code is in test-xbzrle.c, and benchmark code is in xbzrle-bench.c
> for performance benchmarking.
>
> Signed-off-by: ling xu 
> Co-authored-by: Zhou Zhao 
> Co-authored-by: Jun Jin 

Reviewed-by: Juan Quintela 

queued.




Re: [PATCH v6 05/10] vdpa: move SVQ vring features check to net/

2022-11-14 Thread Eugenio Perez Martin
On Mon, Nov 14, 2022 at 5:26 AM Jason Wang  wrote:
>
>
> > On 2022/11/11 20:58, Eugenio Perez Martin wrote:
> > On Fri, Nov 11, 2022 at 9:07 AM Jason Wang  wrote:
> >> On Fri, Nov 11, 2022 at 3:56 PM Eugenio Perez Martin
> >>  wrote:
> >>> On Fri, Nov 11, 2022 at 8:34 AM Jason Wang  wrote:
> 
>  On 2022/11/10 21:09, Eugenio Perez Martin wrote:
> > On Thu, Nov 10, 2022 at 6:40 AM Jason Wang  wrote:
> >> On 2022/11/9 01:07, Eugenio Pérez wrote:
> >>> The next patches will start control SVQ if possible. However, we don't
> >>> know if that will be possible at qemu boot anymore.
> >> If I was not wrong, there's no device specific feature that is checked
> >> in the function. So it should be general enough to be used by devices
> >> other than net. Then I don't see any advantage of doing this.
> >>
> > Because vhost_vdpa_init_svq is called at qemu boot, failing if it is
> > not possible to shadow the Virtqueue.
> >
> > Now the CVQ will be shadowed if possible, so we need to check this at
> > device start, not at initialization.
> 
>  Any reason we can't check this at device start? We don't need
>  driver_features and we can do any probing to make sure cvq has a unique
>  group at initialization time.
> 
> >>> We need the CVQ index to check if it has an independent group. CVQ
> >>> index depends on the features the guest's ack:
> >>> * If it acks _F_MQ, it is the last one.
> >>> * If it doesn't, CVQ idx is 2.
> >>>
> >>> We cannot have acked features at initialization, and they could
> >>> change: It is valid for a guest to ack _F_MQ, then reset the device,
> >>> then not ack it.
> >> Can we do some probing by negotiating _F_MQ if the device offers it,
> >> then we can know if cvq has a unique group?
> >>
> > What if the guest does not ack _F_MQ?
> >
> > To be completed it would go like:
> >
> > * Probe negotiate _F_MQ, check unique group,
> > * Probe negotiate !_F_MQ, check unique group,
>
>
> I think it should be a bug if a device presents a unique virtqueue group
> that depends on a specific feature. That is to say, we can do a single
> round of probing instead of trying it twice here.
>

I didn't mean a single virtqueue group but CVQ has its own group.

From vhost POV the valid behavior is feature dependent already: If
_F_MQ is not negotiated, the vq in a different group must be the 3rd.
If it is negotiated, the vq in its own group must be the last one.

Since the check of the virtqueue group is already dependent on the vq
index I think it would be a mistake to change a vq group ASID without
checking if it is independent or not.
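
A small sketch of the index rule just described (the helper name is
illustrative, not from the series):

    #include "standard-headers/linux/virtio_net.h"

    /* Per the virtio-net spec: with _F_MQ the control vq is the last
     * one (index 2 * max_queue_pairs); without it there is a single
     * queue pair, so the control vq is index 2.
     */
    static unsigned virtio_net_cvq_index(uint64_t acked_features,
                                         unsigned max_queue_pairs)
    {
        if (acked_features & (1ULL << VIRTIO_NET_F_MQ)) {
            return 2 * max_queue_pairs;
        }
        return 2;
    }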

The consequences of not checking that are:
* We will move the whole dataplane vq group to ASID 1, so the device
will not work properly.
* The device will still be declared migratable.

>
> > * Actually negotiate with the guest's feature set.
> > * React to failures. Probably the same way as if the CVQ is not
> > isolated, disabling SVQ?
> >
> > To me it seems simpler to specify somehow that the vq must be independent.
>
>
> It's just a suggestion, if you think doing it at the start, I'm fine.

Don't get me wrong, we can make changes in that direction and I think
it's a good idea, especially to reduce syscalls. I'm just trying to put
all the scenarios on the table because maybe I'm missing something that
would solve them, or we can ignore them.

Thanks!

> But we need document the reason with a comment maybe.
>
> Thanks
>
>
> >
> > Thanks!
> >
> >To store this information at boot
> > time is not valid anymore, because v->shadow_vqs_enabled is not valid
> > at this time anymore.
> 
>  Ok, but this doesn't explain why it is net specific but vhost-vdpa 
>  specific.
> 
> >>> We can try to move it to a vhost op, but we have the same problem as
> >>> the svq array allocation: We don't have the right place in vhost ops
> >>> to check this. Maybe vhost_set_features is the right one here?
> >> If we can do all the probing at the initialization phase, we can do
> >> everything there.
> >>
> >> Thanks
> >>
> >>> Thanks!
> >>>
>  Thanks
> 
> 
> > Thanks!
> >
> >> Thanks
> >>
> >>
> >>> Since the moved checks will be already evaluated at net/ to know if it
> >>> is ok to shadow CVQ, move them.
> >>>
> >>> Signed-off-by: Eugenio Pérez 
> >>> ---
> >>> hw/virtio/vhost-vdpa.c | 33 ++---
> >>> net/vhost-vdpa.c   |  3 ++-
> >>> 2 files changed, 4 insertions(+), 32 deletions(-)
> >>>
> >>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> >>> index 3df2775760..146f0dcb40 100644
> >>> --- a/hw/virtio/vhost-vdpa.c
> >>> +++ b/hw/virtio/vhost-vdpa.c
> >>> @@ -402,29 +402,9 @@ static int vhost_vdpa_get_dev_features(struct 
> >>> vhost_dev *dev,
> >>> return ret;
> >>> }
> >>>
> >>> -static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct 
> >

Re: [PATCH v2 04/15] migration: Trivial cleanup save_page_header() on same block check

2022-11-14 Thread Juan Quintela
Peter Xu  wrote:
> The 2nd check on RAM_SAVE_FLAG_CONTINUE is a bit redundant.  Use a boolean
> to be clearer.
>
> Reviewed-by: Dr. David Alan Gilbert 
> Signed-off-by: Peter Xu 

Reviewed-by: Juan Quintela 




Re: [PATCH v1 5/9] hw/virtio: introduce virtio_device_should_start

2022-11-14 Thread Michael S. Tsirkin
On Mon, Nov 14, 2022 at 05:18:53PM +0100, Christian Borntraeger wrote:
> Am 08.11.22 um 10:23 schrieb Alex Bennée:
> > The previous fix to virtio_device_started revealed a problem in its
> > use by both the core and the device code. The core code should be able
> > to handle the device "starting" while the VM isn't running to handle
> > the restoration of migration state. To solve this dual use, introduce a
> > new helper for use by the vhost-user backends, who all use it to feed a
> > should_start variable.
> > 
> > We can also pick up a change to vhost_user_blk_set_status while we are
> > at it, which follows the same pattern.
> > 
> > Fixes: 9f6bcfd99f (hw/virtio: move vm_running check to 
> > virtio_device_started)
> > Fixes: 27ba7b027f (hw/virtio: add boilerplate for vhost-user-gpio device)
> > Signed-off-by: Alex Bennée 
> > Cc: "Michael S. Tsirkin" 
> 
> Hmmm, is this
> commit 259d69c00b67c02a67f3bdbeeea71c2c0af76c35
> Author: Alex Bennée 
> AuthorDate: Mon Nov 7 12:14:07 2022 +
> Commit: Michael S. Tsirkin 
> CommitDate: Mon Nov 7 14:08:18 2022 -0500
> 
> hw/virtio: introduce virtio_device_should_start
> 
> an older version?

This is what got merged:
https://lore.kernel.org/r/20221107121407.1010913-1-alex.bennee%40linaro.org
This patch was sent after I merged the RFC.
I think the only difference is the commit log but I might be missing
something.

> This does not seem to fix the regression that I have reported.

This was applied on top of 9f6bcfd99f which IIUC does, right?


-- 
MST




[libnbd PATCH v2 06/23] states: Break deadlock if server goofs on extended replies

2022-11-14 Thread Eric Blake
One of the benefits of extended replies is that we can do a
fixed-length read for the entire header of every server reply, which
is fewer syscalls than the split-read approach required by structured
replies.  But one of the drawbacks of doing a large read is that if
the server is non-compliant (not a problem for normal servers, but
something I hit rather more than I'd like to admit while developing
extended header support in servers), nbd_pwrite() and friends will
deadlock if the server replies with the wrong header.  Add in some
code to catch that failure mode and move the state machine to DEAD
sooner, to make it easier to diagnose the fault in the server.

Unlike in the case of an unexpected simple reply from a structured
server (where we never over-read, and can therefore, as of commit
b31e7bac, merely fail the command with EPROTO and successfully move on to
the next server reply), in this case we really do have to move to
DEAD: in addition to having already read the 16 or 20 bytes that the
server sent in its (short) reply for this command, we may have already
read the initial bytes of the server's next reply, but we have no way
to push those extra bytes back onto our read stream for parsing on our
next pass through the state machine.
---
 generator/states-reply.c | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/generator/states-reply.c b/generator/states-reply.c
index dde23b39..e89e9019 100644
--- a/generator/states-reply.c
+++ b/generator/states-reply.c
@@ -109,7 +109,23 @@  REPLY.START:
  REPLY.RECV_REPLY:
   switch (recv_into_rbuf (h)) {
   case -1: SET_NEXT_STATE (%.DEAD); return 0;
-  case 1: SET_NEXT_STATE (%.READY); return 0;
+  case 1: SET_NEXT_STATE (%.READY);
+/* Special case: if we have a short read, but got at least far
+ * enough to decode the magic number, we can check if the server
+ * is matching our expectations. This lets us avoid deadlocking if
+ * a buggy server sends only 16 bytes of a simple reply, and is
+ * waiting for our next command, while we are blocked waiting for
+ * the server to send 32 bytes of an extended reply.
+ */
+if (h->extended_headers &&
+(char *) h->rbuf >= (char *) &h->sbuf.reply.hdr.extended.flags) {
+  uint32_t magic = be32toh (h->sbuf.reply.hdr.extended.magic);
+  if (magic != NBD_EXTENDED_REPLY_MAGIC) {
+SET_NEXT_STATE (%.DEAD); /* We've probably lost synchronization. */
+set_error (0, "invalid or unexpected reply magic 0x%" PRIx32, magic);
+  }
+}
+return 0;
   case 0: SET_NEXT_STATE (%CHECK_SIMPLE_OR_STRUCTURED_REPLY);
   }
   return 0;
-- 
2.38.1




[PATCH 2/2] Do not access /dev/mem in MSI-X PCI passthrough on Xen

2022-11-14 Thread Marek Marczykowski-Górecki
The /dev/mem is used for two purposes:
 - reading PCI_MSIX_ENTRY_CTRL_MASKBIT
 - reading Pending Bit Array (PBA)

The first one was originally done because Xen did not send all vector
ctrl writes to the device model, so QEMU might have had an outdated
register value. This has been changed in Xen, so QEMU can now use its
cached value of the register instead.

The Pending Bit Array (PBA) handling is for the case where it lives on
the same page as the MSI-X table itself. Xen has been extended to handle
this case too (as well as other registers that may live on those pages),
so QEMU handling is not necessary anymore.

Removing /dev/mem access is useful for working within a stubdomain, and
necessary when the dom0 kernel runs in lockdown mode.

Signed-off-by: Marek Marczykowski-Górecki 
---
 hw/xen/xen_pt.h |  1 -
 hw/xen/xen_pt_msi.c | 51 -
 2 files changed, 4 insertions(+), 48 deletions(-)

diff --git a/hw/xen/xen_pt.h b/hw/xen/xen_pt.h
index e7c4316a7d..de4094e7ec 100644
--- a/hw/xen/xen_pt.h
+++ b/hw/xen/xen_pt.h
@@ -206,7 +206,6 @@ typedef struct XenPTMSIX {
 uint32_t table_offset_adjust; /* page align mmap */
 uint64_t mmio_base_addr;
 MemoryRegion mmio;
-void *phys_iomem_base;
 XenPTMSIXEntry msix_entry[];
 } XenPTMSIX;
 
diff --git a/hw/xen/xen_pt_msi.c b/hw/xen/xen_pt_msi.c
index b71563f98a..a8a75dff66 100644
--- a/hw/xen/xen_pt_msi.c
+++ b/hw/xen/xen_pt_msi.c
@@ -460,15 +460,7 @@ static void pci_msix_write(void *opaque, hwaddr addr,
 entry->updated = true;
 } else if (msix->enabled && entry->updated &&
!(val & PCI_MSIX_ENTRY_CTRL_MASKBIT)) {
-const volatile uint32_t *vec_ctrl;
-
-/*
- * If Xen intercepts the mask bit access, entry->vec_ctrl may not be
- * up-to-date. Read from hardware directly.
- */
-vec_ctrl = s->msix->phys_iomem_base + entry_nr * PCI_MSIX_ENTRY_SIZE
-+ PCI_MSIX_ENTRY_VECTOR_CTRL;
-xen_pt_msix_update_one(s, entry_nr, *vec_ctrl);
+xen_pt_msix_update_one(s, entry_nr, entry->latch(VECTOR_CTRL));
 }
 
 set_entry_value(entry, offset, val);
@@ -493,7 +485,9 @@ static uint64_t pci_msix_read(void *opaque, hwaddr addr,
 return get_entry_value(&msix->msix_entry[entry_nr], offset);
 } else {
 /* Pending Bit Array (PBA) */
-return *(uint32_t *)(msix->phys_iomem_base + addr);
+XEN_PT_LOG(&s->dev, "reading PBA, addr %#lx, offset %#lx\n",
+   addr, addr - msix->total_entries * PCI_MSIX_ENTRY_SIZE);
+return 0x;
 }
 }
 
@@ -529,7 +523,6 @@ int xen_pt_msix_init(XenPCIPassthroughState *s, uint32_t 
base)
 int i, total_entries, bar_index;
 XenHostPCIDevice *hd = &s->real_device;
 PCIDevice *d = &s->dev;
-int fd = -1;
 XenPTMSIX *msix = NULL;
 int rc = 0;
 
@@ -576,34 +569,6 @@ int xen_pt_msix_init(XenPCIPassthroughState *s, uint32_t 
base)
 msix->table_base = s->real_device.io_regions[bar_index].base_addr;
 XEN_PT_LOG(d, "get MSI-X table BAR base 0x%"PRIx64"\n", msix->table_base);
 
-fd = open("/dev/mem", O_RDWR);
-if (fd == -1) {
-rc = -errno;
-XEN_PT_ERR(d, "Can't open /dev/mem: %s\n", strerror(errno));
-goto error_out;
-}
-XEN_PT_LOG(d, "table_off = 0x%x, total_entries = %d\n",
-   table_off, total_entries);
-msix->table_offset_adjust = table_off & 0x0fff;
-msix->phys_iomem_base =
-mmap(NULL,
- total_entries * PCI_MSIX_ENTRY_SIZE + msix->table_offset_adjust,
- PROT_READ,
- MAP_SHARED | MAP_LOCKED,
- fd,
- msix->table_base + table_off - msix->table_offset_adjust);
-close(fd);
-if (msix->phys_iomem_base == MAP_FAILED) {
-rc = -errno;
-XEN_PT_ERR(d, "Can't map physical MSI-X table: %s\n", strerror(errno));
-goto error_out;
-}
-msix->phys_iomem_base = (char *)msix->phys_iomem_base
-+ msix->table_offset_adjust;
-
-XEN_PT_LOG(d, "mapping physical MSI-X table to %p\n",
-   msix->phys_iomem_base);
-
 memory_region_add_subregion_overlap(&s->bar[bar_index], table_off,
 &msix->mmio,
 2); /* Priority: pci default + 1 */
@@ -624,14 +589,6 @@ void xen_pt_msix_unmap(XenPCIPassthroughState *s)
 return;
 }
 
-/* unmap the MSI-X memory mapped register area */
-if (msix->phys_iomem_base) {
-XEN_PT_LOG(&s->dev, "unmapping physical MSI-X table from %p\n",
-   msix->phys_iomem_base);
-munmap(msix->phys_iomem_base, msix->total_entries * PCI_MSIX_ENTRY_SIZE
-   + msix->table_offset_adjust);
-}
-
 memory_region_del_subregion(&s->bar[msix->bar_index], &msix->mmio);
 }
 
-- 
2.37.3




Re: [PATCH] s390x: Fix spelling errors

2022-11-14 Thread Thomas Huth

On 11/11/2022 19.38, Stefan Weil wrote:

On 11.11.22 at 19:28, Thomas Huth wrote:


Fix typos (discovered with the 'codespell' utility).

Signed-off-by: Thomas Huth 
---
  hw/s390x/ipl.h  | 2 +-
  pc-bios/s390-ccw/cio.h  | 2 +-
  pc-bios/s390-ccw/iplb.h | 2 +-
  target/s390x/cpu_models.h   | 4 ++--
  hw/s390x/s390-pci-vfio.c    | 2 +-
  hw/s390x/s390-virtio-ccw.c  | 6 +++---
  target/s390x/ioinst.c   | 2 +-
  target/s390x/tcg/excp_helper.c  | 2 +-
  target/s390x/tcg/fpu_helper.c   | 2 +-
  target/s390x/tcg/misc_helper.c  | 2 +-
  target/s390x/tcg/translate.c    | 4 ++--
  target/s390x/tcg/translate_vx.c.inc | 6 +++---
  pc-bios/s390-ccw/start.S    | 2 +-
  13 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h
index dfc6dfd89c..7fc86e7905 100644
--- a/hw/s390x/ipl.h
+++ b/hw/s390x/ipl.h
@@ -140,7 +140,7 @@ void s390_ipl_clear_reset_request(void);
   * have an offset of 4 + n * 8 bytes within the struct in order
   * to keep it double-word aligned.
   * The total size of the struct must never exceed 28 bytes.
- * This definition must be kept in sync with the defininition
+ * This definition must be kept in sync with the definition
   * in pc-bios/s390-ccw/iplb.h.
   */
  struct QemuIplParameters {
diff --git a/pc-bios/s390-ccw/cio.h b/pc-bios/s390-ccw/cio.h
index 1e5d4e92e1..88a88adfd2 100644
--- a/pc-bios/s390-ccw/cio.h
+++ b/pc-bios/s390-ccw/cio.h
@@ -20,7 +20,7 @@ struct pmcw {
  __u32 intparm;  /* interruption parameter */
  __u32 qf:1; /* qdio facility */
  __u32 w:1;
-    __u32 isc:3;    /* interruption sublass */
+    __u32 isc:3;    /* interruption subclass */
  __u32 res5:3;   /* reserved zeros */
  __u32 ena:1;    /* enabled */
  __u32 lm:2; /* limit mode */
diff --git a/pc-bios/s390-ccw/iplb.h b/pc-bios/s390-ccw/iplb.h
index 772d5c57c9..cb6ac8a880 100644
--- a/pc-bios/s390-ccw/iplb.h
+++ b/pc-bios/s390-ccw/iplb.h
@@ -81,7 +81,7 @@ extern IplParameterBlock iplb 
__attribute__((__aligned__(PAGE_SIZE)));

  #define QIPL_FLAG_BM_OPTS_ZIPL  0x40
  /*
- * This definition must be kept in sync with the defininition
+ * This definition must be kept in sync with the definition
   * in hw/s390x/ipl.h
   */
  struct QemuIplParameters {
diff --git a/target/s390x/cpu_models.h b/target/s390x/cpu_models.h
index 74d1f87e4f..15c0f0dcfe 100644
--- a/target/s390x/cpu_models.h
+++ b/target/s390x/cpu_models.h
@@ -24,13 +24,13 @@ struct S390CPUDef {
  uint8_t gen;    /* hw generation identification */
  uint16_t type;  /* cpu type identification */
  uint8_t ec_ga;  /* EC GA version (on which also the BC is 
based) */

-    uint8_t mha_pow;    /* Maximum Host Adress Power, mha = 2^pow-1 */
+    uint8_t mha_pow;    /* Maximum Host Address Power, mha = 2^pow-1 */



This comment could use lower case words.


I thought so, too, but I guess the author used capital letters on purpose to 
make sure to explain the "mha" acronym this way ...


Anyway, I don't mind, we can also switch to lower case here, if that's 
preferred.




diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
index 2aefa508a0..5f0adb0b4a 100644
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -313,7 +313,7 @@ retry:
  /*
   * Get the host function handle from the vfio CLP capabilities chain.  
Returns

   * true if a fh value was placed into the provided buffer.  Returns false
- * if a fh could not be obtained (ioctl failed or capabilitiy version does
+ * if a fh could not be obtained (ioctl failed or capability version does
   * not include the fh)
   */
  bool s390_pci_get_host_fh(S390PCIBusDevice *pbdev, uint32_t *fh)
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 7d80bc1837..2e64ffab45 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -354,7 +354,7 @@ static int s390_machine_protect(S390CcwMachineState *ms)
  }
  error_setg(&pv_mig_blocker,
-   "protected VMs are currently not migrateable.");
+   "protected VMs are currently not migratable.");



This might again be a different British / American spelling.


Maybe ... I did some internet searching but I did not find anything reliable
apart from a few pages saying that "migrateable" is an alternative
spelling of "migratable". Anyway, the latter seems much more common, so I
think we should switch to it - also to silence codespell here in future runs.


 Thomas




[PATCH v2 13/15] nbd/server: Add FLAG_PAYLOAD support to CMD_BLOCK_STATUS

2022-11-14 Thread Eric Blake
Allow a client to request a subset of negotiated meta contexts.  For
example, a client may ask to use a single connection to learn about
both block status and dirty bitmaps, but where the dirty bitmap
queries only need to be performed on a subset of the disk; forcing the
server to compute that information on block status queries in the rest
of the disk is wasted effort (both at the server, and on the amount of
traffic sent over the wire to be parsed and ignored by the client).

Qemu as an NBD client never requests to use more than one meta
context, so it has no need to use block status payloads.  Testing this
instead requires support from libnbd, which CAN access multiple meta
contexts in parallel from a single NBD connection; an interop test
submitted to the libnbd project at the same time as this patch
demonstrates the feature working, as well as testing some corner cases
(for example, when the payload length is longer than the export
length), although other corner cases (like passing the same id
duplicated) require a protocol fuzzer because libnbd is not wired up
to break the protocol that badly.

This also includes tweaks to 'qemu-nbd --list' to show when a server
is advertising the capability, and to the testsuite to reflect the
addition to that output.
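
As a rough illustration of the client payload being described (a sketch
following the proposed extension's wire format; the helper and its
parameters are hypothetical, not part of this patch):

    #include <endian.h>
    #include <stdint.h>
    #include <string.h>

    /* Serialize an NBD_CMD_BLOCK_STATUS payload restricting the reply
     * to n of the negotiated meta context ids; returns bytes written.
     */
    static size_t build_block_status_payload(uint8_t *buf,
                                             uint64_t effect_length,
                                             const uint32_t *ids,
                                             size_t n)
    {
        uint64_t be_len = htobe64(effect_length);
        size_t i;

        memcpy(buf, &be_len, sizeof(be_len));
        for (i = 0; i < n; i++) {
            uint32_t be_id = htobe32(ids[i]);
            memcpy(buf + 8 + 4 * i, &be_id, sizeof(be_id));
        }
        return 8 + 4 * n;
    }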

Signed-off-by: Eric Blake 
---
 docs/interop/nbd.txt  |   2 +-
 include/block/nbd.h   |  32 --
 nbd/server.c  | 106 +-
 qemu-nbd.c|   1 +
 nbd/trace-events  |   1 +
 tests/qemu-iotests/223.out|  12 +-
 tests/qemu-iotests/307.out|  10 +-
 .../tests/nbd-qemu-allocation.out |   2 +-
 8 files changed, 136 insertions(+), 30 deletions(-)

diff --git a/docs/interop/nbd.txt b/docs/interop/nbd.txt
index 988c072697..b7893043a3 100644
--- a/docs/interop/nbd.txt
+++ b/docs/interop/nbd.txt
@@ -69,4 +69,4 @@ NBD_CMD_BLOCK_STATUS for "qemu:dirty-bitmap:", NBD_CMD_CACHE
 NBD_CMD_FLAG_FAST_ZERO
 * 5.2: NBD_CMD_BLOCK_STATUS for "qemu:allocation-depth"
 * 7.1: NBD_FLAG_CAN_MULTI_CONN for shareable writable exports
-* 7.2: NBD_OPT_EXTENDED_HEADERS
+* 7.2: NBD_OPT_EXTENDED_HEADERS, NBD_FLAG_BLOCK_STATUS_PAYLOAD
diff --git a/include/block/nbd.h b/include/block/nbd.h
index 9a8ac1c8a5..2a65c606c9 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -167,6 +167,12 @@ typedef struct NBDExtentExt {
 uint64_t flags; /* NBD_STATE_* */
 } QEMU_PACKED NBDExtentExt;

+/* Client payload for limiting NBD_CMD_BLOCK_STATUS reply */
+typedef struct NBDBlockStatusPayload {
+uint64_t effect_length;
+/* uint32_t ids[] follows, array length implied by header */
+} QEMU_PACKED NBDBlockStatusPayload;
+
 /* Transmission (export) flags: sent from server to client during handshake,
but describe what will happen during transmission */
 enum {
@@ -183,20 +189,22 @@ enum {
 NBD_FLAG_SEND_RESIZE_BIT=  9, /* Send resize */
 NBD_FLAG_SEND_CACHE_BIT = 10, /* Send CACHE (prefetch) */
 NBD_FLAG_SEND_FAST_ZERO_BIT = 11, /* FAST_ZERO flag for WRITE_ZEROES */
+NBD_FLAG_BLOCK_STAT_PAYLOAD_BIT = 12, /* PAYLOAD flag for BLOCK_STATUS */
 };

-#define NBD_FLAG_HAS_FLAGS (1 << NBD_FLAG_HAS_FLAGS_BIT)
-#define NBD_FLAG_READ_ONLY (1 << NBD_FLAG_READ_ONLY_BIT)
-#define NBD_FLAG_SEND_FLUSH(1 << NBD_FLAG_SEND_FLUSH_BIT)
-#define NBD_FLAG_SEND_FUA  (1 << NBD_FLAG_SEND_FUA_BIT)
-#define NBD_FLAG_ROTATIONAL(1 << NBD_FLAG_ROTATIONAL_BIT)
-#define NBD_FLAG_SEND_TRIM (1 << NBD_FLAG_SEND_TRIM_BIT)
-#define NBD_FLAG_SEND_WRITE_ZEROES (1 << NBD_FLAG_SEND_WRITE_ZEROES_BIT)
-#define NBD_FLAG_SEND_DF   (1 << NBD_FLAG_SEND_DF_BIT)
-#define NBD_FLAG_CAN_MULTI_CONN(1 << NBD_FLAG_CAN_MULTI_CONN_BIT)
-#define NBD_FLAG_SEND_RESIZE   (1 << NBD_FLAG_SEND_RESIZE_BIT)
-#define NBD_FLAG_SEND_CACHE(1 << NBD_FLAG_SEND_CACHE_BIT)
-#define NBD_FLAG_SEND_FAST_ZERO(1 << NBD_FLAG_SEND_FAST_ZERO_BIT)
+#define NBD_FLAG_HAS_FLAGS  (1 << NBD_FLAG_HAS_FLAGS_BIT)
+#define NBD_FLAG_READ_ONLY  (1 << NBD_FLAG_READ_ONLY_BIT)
+#define NBD_FLAG_SEND_FLUSH (1 << NBD_FLAG_SEND_FLUSH_BIT)
+#define NBD_FLAG_SEND_FUA   (1 << NBD_FLAG_SEND_FUA_BIT)
+#define NBD_FLAG_ROTATIONAL (1 << NBD_FLAG_ROTATIONAL_BIT)
+#define NBD_FLAG_SEND_TRIM  (1 << NBD_FLAG_SEND_TRIM_BIT)
+#define NBD_FLAG_SEND_WRITE_ZEROES  (1 << NBD_FLAG_SEND_WRITE_ZEROES_BIT)
+#define NBD_FLAG_SEND_DF(1 << NBD_FLAG_SEND_DF_BIT)
+#define NBD_FLAG_CAN_MULTI_CONN (1 << NBD_FLAG_CAN_MULTI_CONN_BIT)
+#define NBD_FLAG_SEND_RESIZE(1 << NBD_FLAG_SEND_RESIZE_BIT)
+#define NBD_FLAG_SEND_CACHE (1 << NBD_FLAG_SEND_CACHE_BIT)
+#define NBD_FLAG_SEND_FAST_ZERO (1 << NBD_FLAG_SEND_FAST_ZERO_BIT)
+#define NBD_FLAG_BLOCK_STAT_PAYLOAD (1 << NBD_FLAG_BLOCK_STAT_PAYLOAD_BIT)

 /* New-style hand

Re: [PATCH v1 3/3] target/riscv: kvm: Support selecting VCPU extensions

2022-11-14 Thread Alistair Francis
On Thu, Oct 27, 2022 at 3:53 PM Mayuresh Chitale
 wrote:
>
> Set the state of each ISA extension on the vcpu depending on what
> is set in the CPU property and what is allowed by KVM for that extension.
>
> Signed-off-by: Mayuresh Chitale 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  target/riscv/cpu.c   | 11 -
>  target/riscv/kvm.c   | 88 ++--
>  target/riscv/kvm_riscv.h |  2 +-
>  3 files changed, 87 insertions(+), 14 deletions(-)
>
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 35320a8547..e52577d59d 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -1191,10 +1191,19 @@ static void riscv_isa_string_ext(RISCVCPU *cpu, char 
> **isa_str)
>  {
>  char *old = *isa_str;
>  char *new = *isa_str;
> -int i;
> +int i, offset;
>
>  for (i = 0; i < ARRAY_SIZE(isa_edata_arr); i++) {
>  if (isa_ext_is_enabled(cpu, &isa_edata_arr[i])) {
> +offset = isa_edata_arr[i].ext_enable_offset;
> +if (kvm_enabled() && !kvm_riscv_ext_supported(offset)) {
> +#ifndef CONFIG_USER_ONLY
> +info_report("disabling %s extension for hart 0x%lx because "
> +"kvm does not support it", isa_edata_arr[i].name,
> +(unsigned long)cpu->env.mhartid);
> +#endif
> +continue;
> +}
>  if (isa_edata_arr[i].multi_letter) {
>  if (cpu->cfg.short_isa_string) {
>  continue;
> diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
> index 30f21453d6..ea0715c9e4 100644
> --- a/target/riscv/kvm.c
> +++ b/target/riscv/kvm.c
> @@ -42,6 +42,29 @@
>  #include "migration/migration.h"
>  #include "sysemu/runstate.h"
>
> +struct isa_ext_info {
> +const char *name;
> +target_ulong misa_bit;
> +int ext_enable_offset;
> +};
> +
> +#define ISA_EXT_DATA_ENTRY(_name, _bit, _prop) \
> +{#_name, _bit, offsetof(struct RISCVCPUConfig, _prop)}
> +
> +static const struct isa_ext_info isa_info_arr[] = {
> +ISA_EXT_DATA_ENTRY(a, RVA, ext_a),
> +ISA_EXT_DATA_ENTRY(c, RVC, ext_c),
> +ISA_EXT_DATA_ENTRY(d, RVD, ext_d),
> +ISA_EXT_DATA_ENTRY(f, RVF, ext_f),
> +ISA_EXT_DATA_ENTRY(h, RVH, ext_h),
> +ISA_EXT_DATA_ENTRY(i, RVI, ext_i),
> +ISA_EXT_DATA_ENTRY(m, RVM, ext_m),
> +ISA_EXT_DATA_ENTRY(svpbmt, 0, ext_svpbmt),
> +ISA_EXT_DATA_ENTRY(sstc, 0, ext_sstc),
> +ISA_EXT_DATA_ENTRY(svinval, 0, ext_svinval),
> +ISA_EXT_DATA_ENTRY(zihintpause, 0, ext_zihintpause),
> +};
> +
>  static uint64_t kvm_riscv_reg_id(CPURISCVState *env, uint64_t type,
>   uint64_t idx)
>  {
> @@ -394,25 +417,66 @@ void kvm_arch_init_irq_routing(KVMState *s)
>  {
>  }
>
> +bool kvm_riscv_ext_supported(int offset)
> +{
> +int i;
> +
> +for (i = 0; i < KVM_RISCV_ISA_EXT_MAX; ++i) {
> +if (isa_info_arr[i].ext_enable_offset == offset) {
> +return true;
> +}
> +}
> +return false;
> +}
> +
> +static void kvm_riscv_set_isa_ext(CPUState *cs, CPURISCVState *env)
> +{
> +RISCVCPU *cpu = RISCV_CPU(cs);
> +unsigned long isa_ext_out;
> +bool *ext_state;
> +uint64_t id;
> +int i, ret;
> +
> +env->misa_ext = 0;
> +for (i = 0; i < ARRAY_SIZE(isa_info_arr); i++) {
> +ext_state = (void *)&cpu->cfg + isa_info_arr[i].ext_enable_offset;
> +id = kvm_riscv_reg_id(env, KVM_REG_RISCV_ISA_EXT, i);
> +ret = kvm_get_one_reg(cs, id, &isa_ext_out);
> +if (ret) {
> +warn_report("Disabling ext %s due to failure.",
> +isa_info_arr[i].name);
> +*ext_state = false;
> +continue;
> +}
> +if (isa_ext_out != (*ext_state)) {
> +isa_ext_out = *ext_state;
> +ret = kvm_set_one_reg(cs, id, &isa_ext_out);
> +if (ret) {
> +warn_report("Could not %s ext %s.",
> +(isa_ext_out ? "enable" : "disable"),
> +isa_info_arr[i].name);
> +*ext_state = !isa_ext_out;
> +}
> +}
> +/*
> + * If the single letter extension is supported by KVM then set
> + * the corresponding misa bit for the guest vcpu.
> + */
> +if (isa_info_arr[i].misa_bit && (*ext_state)) {
> +env->misa_ext |= isa_info_arr[i].misa_bit;
> +}
> +}
> +}
> +
>  int kvm_arch_init_vcpu(CPUState *cs)
>  {
> -int ret = 0;
> -target_ulong isa;
>  RISCVCPU *cpu = RISCV_CPU(cs);
>  CPURISCVState *env = &cpu->env;
> -uint64_t id;
>
>  qemu_add_vm_change_state_handler(kvm_riscv_vm_state_change, cs);
>
> -id = kvm_riscv_reg_id(env, KVM_REG_RISCV_CONFIG,
> -  KVM_REG_RISCV_CONFIG_REG(isa));
> -ret = kvm_get_one_reg(cs, id, &isa);
> -if (ret) {
> -return ret;
> -}
> -env->misa_ext = isa;
> -
> - 

Re: [PATCH v2 12/12] hw/intc: add implementation of GICD_IIDR to Arm GIC

2022-11-14 Thread Peter Maydell
On Fri, 11 Nov 2022 at 14:55, Alex Bennée  wrote:
>
> a66a24585f (hw/intc/arm_gic: Implement read of GICC_IIDR) implemented
> this for the CPU interface register. The fact that we don't implement it
> shows up when running Xen with -d guest_error, which is definitely
> wrong because the guest is perfectly entitled to read it.
>
> Lightly re-factor this region of registers and also add a comment to
> the function in case anyone was under the illusion that we only return
> bytes from a function called readb.
>
> Signed-off-by: Alex Bennée 
>
> ---
> v2
>   - checkpatch fixes.
> ---
>  hw/intc/arm_gic.c | 44 ++--
>  1 file changed, 30 insertions(+), 14 deletions(-)
>
> diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c
> index 492b2421ab..65b1ef7151 100644
> --- a/hw/intc/arm_gic.c
> +++ b/hw/intc/arm_gic.c
> @@ -941,6 +941,10 @@ static void gic_complete_irq(GICState *s, int cpu, int 
> irq, MemTxAttrs attrs)
>  gic_update(s);
>  }
>
> +/*
> + * Although this is named a byte read, we don't always return bytes and
> + * rely on the calling function ORing bits together.
> + */

Rather than documenting this, maybe it would be better to
fix the weirdness? We only do this for exactly one register,
the GICD_TYPER. Everything else is naturally byte-based.
(The GICD_CTLR looks like it is also doing this, but the
only non-zero bits are in the low byte, so it isn't really.)

The GICD_TYPER returning more than a byte's worth of
data I think is a bug we introduced in commit 5543d1abb6e2
when we added the security extensions support -- before that
all the bits we needed to return were in the low byte. So
I think we can fix this with just (untested):

--- a/hw/intc/arm_gic.c
+++ b/hw/intc/arm_gic.c
@@ -955,6 +955,7 @@ static uint32_t gic_dist_readb(void *opaque,
hwaddr offset, MemTxAttrs attrs)
 cm = 1 << cpu;
 if (offset < 0x100) {
 if (offset == 0) {  /* GICD_CTLR */
+/* We rely here on the only non-zero bits being in byte 0 */
 if (s->security_extn && !attrs.secure) {
 /* The NS bank of this register is just an alias of the
  * EnableGrp1 bit in the S bank version.
@@ -964,11 +965,14 @@ static uint32_t gic_dist_readb(void *opaque,
hwaddr offset, MemTxAttrs attrs)
 return s->ctlr;
 }
 }
-if (offset == 4)
-/* Interrupt Controller Type Register */
-return ((s->num_irq / 32) - 1)
-| ((s->num_cpu - 1) << 5)
-| (s->security_extn << 10);
+if (offset == 4) {
+/* GICD_TYPER byte 0 */
+return ((s->num_irq / 32) - 1) | ((s->num_cpu - 1) << 5);
+}
+if (offset == 5) {
+/* GICD_TYPER byte 1 */
+return (s->security_extn << 2);
+}
 if (offset < 0x08)
 return 0;
 if (offset >= 0x80) {


(you can add my Signed-off-by: if you want to turn that into a proper patch.)

thanks
-- PMM



[PATCH v2 08/15] nbd/server: Support 64-bit block status

2022-11-14 Thread Eric Blake
The previous patch handled extended headers by truncating large block
status requests from the client back to 32 bits.  But this is not
ideal; for cases where we can truly determine the status of the entire
image quickly (for example, when reporting the entire image as
non-sparse because we lack the ability to probe for holes), this
causes more network traffic for the client to iterate through 4G
chunks than for us to just report the entire image at once.  For ease
of implementation, if extended headers were negotiated, then we always
reply with 64-bit block status replies, even when the result could
have fit in the older 32-bit block status chunk (clients supporting
extended headers have to be prepared for either chunk type, so
temporarily reverting this patch proves whether a client is
compliant).

For now, all metacontexts that we know how to export never populate
more than 32 bits of information, so we don't have to worry about
NBD_REP_ERR_EXT_HEADER_REQD or filtering during handshake, and we
always send all zeroes for the upper 32 bits of status during
NBD_CMD_BLOCK_STATUS.

Note that we previously had some interesting size-juggling on call
chains, such as:

nbd_co_send_block_status(uint32_t length)
-> blockstatus_to_extents(uint32_t bytes)
  -> bdrv_block_status_above(bytes, &uint64_t num)
  -> nbd_extent_array_add(uint64_t num)
    -> store num in 32-bit length

But we were lucky that it never overflowed: bdrv_block_status_above
never sets num larger than bytes, and we had previously been capping
'bytes' at 32 bits (either by the protocol, or in the previous patch
with an explicit truncation).  This patch adds some assertions that
ensure we continue to avoid overflowing 32 bits for a narrow client,
while fully utilizing 64-bits all the way through when the client
understands that.

Signed-off-by: Eric Blake 
---
 nbd/server.c | 86 +++-
 1 file changed, 59 insertions(+), 27 deletions(-)

diff --git a/nbd/server.c b/nbd/server.c
index b46655b4d8..f21f8098c1 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -2145,20 +2145,30 @@ static int coroutine_fn 
nbd_co_send_sparse_read(NBDClient *client,
 }

 typedef struct NBDExtentArray {
-NBDExtent *extents;
+union {
+NBDStructuredMeta id;
+NBDStructuredMetaExt meta;
+};
+union {
+NBDExtent *narrow;
+NBDExtentExt *extents;
+};
 unsigned int nb_alloc;
 unsigned int count;
 uint64_t total_length;
+bool extended; /* Whether 64-bit extents are allowed */
 bool can_add;
 bool converted_to_be;
 } NBDExtentArray;

-static NBDExtentArray *nbd_extent_array_new(unsigned int nb_alloc)
+static NBDExtentArray *nbd_extent_array_new(unsigned int nb_alloc,
+bool extended)
 {
 NBDExtentArray *ea = g_new0(NBDExtentArray, 1);

 ea->nb_alloc = nb_alloc;
-ea->extents = g_new(NBDExtent, nb_alloc);
+ea->extents = g_new(NBDExtentExt, nb_alloc);
+ea->extended = extended;
 ea->can_add = true;

 return ea;
@@ -2172,17 +2182,37 @@ static void nbd_extent_array_free(NBDExtentArray *ea)
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(NBDExtentArray, nbd_extent_array_free)

 /* Further modifications of the array after conversion are abandoned */
-static void nbd_extent_array_convert_to_be(NBDExtentArray *ea)
+static void nbd_extent_array_convert_to_be(NBDExtentArray *ea,
+   uint32_t context_id,
+   struct iovec *iov)
 {
 int i;

 assert(!ea->converted_to_be);
+assert(iov[0].iov_base == &ea->meta);
+assert(iov[1].iov_base == ea->extents);
 ea->can_add = false;
 ea->converted_to_be = true;

-for (i = 0; i < ea->count; i++) {
-ea->extents[i].flags = cpu_to_be32(ea->extents[i].flags);
-ea->extents[i].length = cpu_to_be32(ea->extents[i].length);
+stl_be_p(&ea->meta.context_id, context_id);
+if (ea->extended) {
+stl_be_p(&ea->meta.count, ea->count);
+for (i = 0; i < ea->count; i++) {
+ea->extents[i].length = cpu_to_be64(ea->extents[i].length);
+ea->extents[i].flags = cpu_to_be64(ea->extents[i].flags);
+}
+iov[0].iov_len = sizeof(ea->meta);
+iov[1].iov_len = ea->count * sizeof(ea->extents[0]);
+} else {
+/* Conversion reduces memory usage, order of iteration matters */
+for (i = 0; i < ea->count; i++) {
+assert(ea->extents[i].length <= UINT32_MAX);
+assert((uint32_t) ea->extents[i].flags == ea->extents[i].flags);
+ea->narrow[i].length = cpu_to_be32(ea->extents[i].length);
+ea->narrow[i].flags = cpu_to_be32(ea->extents[i].flags);
+}
+iov[0].iov_len = sizeof(ea->id);
+iov[1].iov_len = ea->count * sizeof(ea->narrow[0]);
 }
 }

@@ -2196,19 +2226,23 @@ static void 
nbd_extent_array_convert_to_be(NBDExtentArray *ea)
  * would result in an inc

[PATCH v2 12/15] nbd/server: Prepare for per-request filtering of BLOCK_STATUS

2022-11-14 Thread Eric Blake
The next commit will add support for the new addition of
NBD_CMD_FLAG_PAYLOAD during NBD_CMD_BLOCK_STATUS, where the client can
request that the server only return a subset of negotiated contexts,
rather than all contexts.  To make that task easier, this patch
populates the list of contexts to return on a per-command basis (for
now, identical to the full set of negotiated contexts).

Signed-off-by: Eric Blake 
---
 include/block/nbd.h |  20 +++-
 nbd/server.c| 108 +++-
 2 files changed, 75 insertions(+), 53 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index 02e31b2261..9a8ac1c8a5 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -50,8 +50,23 @@ typedef struct NBDOptionReplyMetaContext {
 /* metadata context name follows */
 } QEMU_PACKED NBDOptionReplyMetaContext;

-/* Transmission phase structs
- *
+/* Transmission phase structs */
+
+/*
+ * NBDMetaContexts represents a list of meta contexts in use, as
+ * selected by NBD_OPT_SET_META_CONTEXT. Also used for
+ * NBD_OPT_LIST_META_CONTEXT, and payload filtering in
+ * NBD_CMD_BLOCK_STATUS.
+ */
+typedef struct NBDMetaContexts {
+size_t count; /* number of negotiated contexts */
+bool base_allocation; /* export base:allocation context (block status) */
+bool allocation_depth; /* export qemu:allocation-depth */
+size_t nr_bitmaps; /* Length of bitmaps array */
+bool *bitmaps; /* export qemu:dirty-bitmap: */
+} NBDMetaContexts;
+
+/*
  * Note: NBDRequest is _NOT_ the same as the network representation of an NBD
  * request!
  */
@@ -61,6 +76,7 @@ typedef struct NBDRequest {
 uint64_t len;   /* Effect length; 32 bit limit without extended headers */
 uint16_t flags; /* NBD_CMD_FLAG_* */
 uint16_t type;  /* NBD_CMD_* */
+NBDMetaContexts contexts; /* Used by NBD_CMD_BLOCK_STATUS */
 } NBDRequest;

 typedef struct NBDSimpleReply {
diff --git a/nbd/server.c b/nbd/server.c
index f21f8098c1..1fd1f32028 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -103,20 +103,6 @@ struct NBDExport {

 static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);

-/* NBDExportMetaContexts represents a list of contexts to be exported,
- * as selected by NBD_OPT_SET_META_CONTEXT. Also used for
- * NBD_OPT_LIST_META_CONTEXT. */
-typedef struct NBDExportMetaContexts {
-NBDExport *exp;
-size_t count; /* number of negotiated contexts */
-bool base_allocation; /* export base:allocation context (block status) */
-bool allocation_depth; /* export qemu:allocation-depth */
-bool *bitmaps; /*
-* export qemu:dirty-bitmap:,
-* sized by exp->nr_export_bitmaps
-*/
-} NBDExportMetaContexts;
-
 struct NBDClient {
 int refcount;
 void (*close_fn)(NBDClient *client, bool negotiated);
@@ -143,7 +129,8 @@ struct NBDClient {

 bool structured_reply; /* also set true if extended_headers is set */
 bool extended_headers;
-NBDExportMetaContexts export_meta;
+NBDExport *context_exp; /* export of last OPT_SET_META_CONTEXT */
+NBDMetaContexts contexts; /* Negotiated meta contexts */

 uint32_t opt; /* Current option being negotiated */
 uint32_t optlen; /* remaining length of data in ioc for the option being
@@ -456,8 +443,8 @@ static int nbd_negotiate_handle_list(NBDClient *client, 
Error **errp)

 static void nbd_check_meta_export(NBDClient *client)
 {
-if (client->exp != client->export_meta.exp) {
-client->export_meta.count = 0;
+if (client->exp != client->context_exp) {
+client->contexts.count = 0;
 }
 }

@@ -847,7 +834,7 @@ static bool nbd_strshift(const char **str, const char 
*prefix)
  * Handle queries to 'base' namespace. For now, only the base:allocation
  * context is available.  Return true if @query has been handled.
  */
-static bool nbd_meta_base_query(NBDClient *client, NBDExportMetaContexts *meta,
+static bool nbd_meta_base_query(NBDClient *client, NBDMetaContexts *meta,
 const char *query)
 {
 if (!nbd_strshift(&query, "base:")) {
@@ -867,8 +854,8 @@ static bool nbd_meta_base_query(NBDClient *client, 
NBDExportMetaContexts *meta,
  * and qemu:allocation-depth contexts are available.  Return true if @query
  * has been handled.
  */
-static bool nbd_meta_qemu_query(NBDClient *client, NBDExportMetaContexts *meta,
-const char *query)
+static bool nbd_meta_qemu_query(NBDClient *client, NBDExport *exp,
+NBDMetaContexts *meta, const char *query)
 {
 size_t i;

@@ -879,9 +866,9 @@ static bool nbd_meta_qemu_query(NBDClient *client, 
NBDExportMetaContexts *meta,

 if (!*query) {
 if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
-meta->allocation_depth = meta->exp->allocation_depth;
-if (meta->exp->nr_export_bitmaps) {
-memset(meta->bitmaps, 1, meta->exp->nr_export_bit

Re: [PATCH v6 09/10] vdpa: Add listener_shadow_vq to vhost_vdpa

2022-11-14 Thread Jason Wang



On 2022/11/11 21:12, Eugenio Perez Martin wrote:

On Fri, Nov 11, 2022 at 8:49 AM Jason Wang  wrote:


On 2022/11/10 21:47, Eugenio Perez Martin wrote:

On Thu, Nov 10, 2022 at 7:01 AM Jason Wang  wrote:

On Wed, Nov 9, 2022 at 1:08 AM Eugenio Pérez  wrote:

The memory listener that tells the device how to convert GPA to qemu's
va is registered against CVQ vhost_vdpa. This series tries to map the
memory listener translations to ASID 0, while it maps the CVQ ones to
ASID 1.

Let's tell the listener if it needs to register them on iova tree or
not.

Signed-off-by: Eugenio Pérez 
---
v5: Solve conflict about vhost_iova_tree_remove accepting mem_region by
  value.
---
   include/hw/virtio/vhost-vdpa.h | 2 ++
   hw/virtio/vhost-vdpa.c | 6 +++---
   net/vhost-vdpa.c   | 1 +
   3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
index 6560bb9d78..0c3ed2d69b 100644
--- a/include/hw/virtio/vhost-vdpa.h
+++ b/include/hw/virtio/vhost-vdpa.h
@@ -34,6 +34,8 @@ typedef struct vhost_vdpa {
   struct vhost_vdpa_iova_range iova_range;
   uint64_t acked_features;
   bool shadow_vqs_enabled;
+/* The listener must send iova tree addresses, not GPA */


Btw, cindy's vIOMMU series will make it not necessarily GPA any more.


Yes, this comment should be tuned then. But the SVQ iova_tree will not
be equal to the vIOMMU one because of the shadow vrings.

But maybe SVQ can inspect both instead of having all the duplicated entries.


+bool listener_shadow_vq;
   /* IOVA mapping used by the Shadow Virtqueue */
   VhostIOVATree *iova_tree;
   GPtrArray *shadow_vqs;
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 8fd32ba32b..e3914fa40e 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -220,7 +220,7 @@ static void vhost_vdpa_listener_region_add(MemoryListener 
*listener,
vaddr, section->readonly);

   llsize = int128_sub(llend, int128_make64(iova));
-if (v->shadow_vqs_enabled) {
+if (v->listener_shadow_vq) {
   int r;

   mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr,
@@ -247,7 +247,7 @@ static void vhost_vdpa_listener_region_add(MemoryListener 
*listener,
   return;

   fail_map:
-if (v->shadow_vqs_enabled) {
+if (v->listener_shadow_vq) {
   vhost_iova_tree_remove(v->iova_tree, mem_region);
   }

@@ -292,7 +292,7 @@ static void vhost_vdpa_listener_region_del(MemoryListener 
*listener,

   llsize = int128_sub(llend, int128_make64(iova));

-if (v->shadow_vqs_enabled) {
+if (v->listener_shadow_vq) {
   const DMAMap *result;
   const void *vaddr = memory_region_get_ram_ptr(section->mr) +
   section->offset_within_region +
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 85a318faca..02780ee37b 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -570,6 +570,7 @@ static NetClientState *net_vhost_vdpa_init(NetClientState 
*peer,
   s->vhost_vdpa.index = queue_pair_index;
   s->always_svq = svq;
   s->vhost_vdpa.shadow_vqs_enabled = svq;
+s->vhost_vdpa.listener_shadow_vq = svq;

Any chance those above two can differ?


If CVQ is shadowed but data VQs are not, shadow_vqs_enabled is true
but listener_shadow_vq is not.

It is more clear in the next commit, where only shadow_vqs_enabled is
set to true at vhost_vdpa_net_cvq_start.


Ok, the name looks a little bit confusing. I wonder if it's better to
use shadow_cvq and shadow_data ?


I'm ok with renaming it, but struct vhost_vdpa is generic across all
kind of devices, and it does not know if it is a datapath or not for
the moment.

Maybe listener_uses_iova_tree?



I think "iova_tree" is something that is internal to svq implementation, 
it's better to define the name from the view of vhost_vdpa level.


Thanks




Thanks!






Re: [PATCH for 8.0 8/8] hmp: add cryptodev info command

2022-11-14 Thread Dr. David Alan Gilbert
* zhenwei pi (pizhen...@bytedance.com) wrote:
> Example of this command:
>  # virsh qemu-monitor-command vm --hmp info cryptodev
> cryptodev1: service=[akcipher|mac|hash|cipher]
> queue 0: type=builtin
> cryptodev0: service=[akcipher]
> queue 0: type=lkcf
> 
> Signed-off-by: zhenwei pi 
> ---
>  hmp-commands-info.hx  | 14 ++
>  include/monitor/hmp.h |  1 +
>  monitor/hmp-cmds.c| 36 
>  3 files changed, 51 insertions(+)
> 
> diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
> index 754b1e8408..47d63d26db 100644
> --- a/hmp-commands-info.hx
> +++ b/hmp-commands-info.hx
> @@ -993,3 +993,17 @@ SRST
>``info virtio-queue-element`` *path* *queue* [*index*]
>  Display element of a given virtio queue
>  ERST
> +
> +{
> +.name   = "cryptodev",
> +.args_type  = "",
> +.params = "",
> +.help   = "show the crypto devices",
> +.cmd= hmp_info_cryptodev,
> +.flags  = "p",
> +},
> +
> +SRST
> +  ``info cryptodev``
> +Show the crypto devices.
> +ERST
> diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
> index dfbc0c9a2f..b6b2b49202 100644
> --- a/include/monitor/hmp.h
> +++ b/include/monitor/hmp.h
> @@ -143,5 +143,6 @@ void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict 
> *qdict);
>  void hmp_human_readable_text_helper(Monitor *mon,
>  HumanReadableText *(*qmp_handler)(Error 
> **));
>  void hmp_info_stats(Monitor *mon, const QDict *qdict);
> +void hmp_info_cryptodev(Monitor *mon, const QDict *qdict);
>  
>  #endif
> diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
> index 01b789a79e..3f1054aa1e 100644
> --- a/monitor/hmp-cmds.c
> +++ b/monitor/hmp-cmds.c
> @@ -33,6 +33,7 @@
>  #include "qapi/qapi-commands-block.h"
>  #include "qapi/qapi-commands-char.h"
>  #include "qapi/qapi-commands-control.h"
> +#include "qapi/qapi-commands-cryptodev.h"
>  #include "qapi/qapi-commands-machine.h"
>  #include "qapi/qapi-commands-migration.h"
>  #include "qapi/qapi-commands-misc.h"
> @@ -2761,3 +2762,38 @@ void hmp_virtio_queue_element(Monitor *mon, const 
> QDict *qdict)
>  
>  qapi_free_VirtioQueueElement(e);
>  }
> +
> +void hmp_info_cryptodev(Monitor *mon, const QDict *qdict)
> +{
> +CryptodevInfoList *info_list;
> +CryptodevInfo *info;
> +QCryptodevBackendServiceTypeList *service_list;
> +CryptodevBackendClientList *client_list;
> +CryptodevBackendClient *client;
> +char services[128] = {};

I'd rather avoid magic length buffers; the magic is always the wrong
number!

> +int len;
> +
> +info_list = qmp_query_cryptodev(NULL);
> +for ( ; info_list; info_list = info_list->next) {

maybe:
 for ( info_list = qmp_query_cryptodev(NULL);
   info_list;
   info_list = info_list->next) {

> +info = info_list->value;
> +
> +service_list = info->service;
> +for (len = 0; service_list; service_list = service_list->next) {
> +len += snprintf(services + len, sizeof(services) - len, "%s|",
> +QCryptodevBackendServiceType_str(service_list->value));

Consider using a dynamically allocated string and then just using
g_strconcat or g_strjoin() to glue them all together.

new_services = g_strjoin("|", services,  NULL);   ?
g_free(services);
services = new_services;

Maybe?
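
Another option in the same spirit, avoiding both the fixed-size buffer
and the manual '|' stripping, would be a GString (an untested sketch,
reusing the variable names from the patch):

    GString *services = g_string_new(NULL);

    for (service_list = info->service; service_list;
         service_list = service_list->next) {
        if (services->len) {
            g_string_append_c(services, '|');
        }
        g_string_append(services,
                        QCryptodevBackendServiceType_str(service_list->value));
    }
    monitor_printf(mon, "%s: service=[%s]\n", info->id, services->str);
    g_string_free(services, TRUE);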


> +}
> +if (len) {
> +services[len - 1] = '\0'; /* strip last char '|' */
> +}
> +monitor_printf(mon, "%s: service=[%s]\n", info->id, services);
> +
> +client_list = info->client;
> +for ( ; client_list; client_list = client_list->next) {
> +client = client_list->value;
> +monitor_printf(mon, "queue %ld: type=%s\n", client->queue,
> +  QCryptodevBackendType_str(client->type));
> +}
> +}
> +
> +qapi_free_CryptodevInfoList(info_list);
> +}
> -- 
> 2.20.1
> 
-- 
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [PATCH v9 1/8] mm: Introduce memfd_restricted system call to create restricted user memory

2022-11-14 Thread Kirill A. Shutemov
On Mon, Nov 14, 2022 at 03:02:37PM +0100, Vlastimil Babka wrote:
> On 11/1/22 16:19, Michael Roth wrote:
> > On Tue, Nov 01, 2022 at 07:37:29PM +0800, Chao Peng wrote:
> >> > 
> >> >   1) restoring kernel directmap:
> >> > 
> >> >  Currently SNP (and I believe TDX) need to either split or remove kernel
> >> >  direct mappings for restricted PFNs, since there is no guarantee that
> >> >  other PFNs within a 2MB range won't be used for non-restricted memory
> >> >  (which will cause an RMP #PF in the case of SNP since the 2MB
> >> >  mapping overlaps with guest-owned pages)
> >> 
> >> Has the splitting and restoring been a well-discussed direction? I'm
> >> just curious whether there are other options to solve this issue.
> > 
> > For SNP it's been discussed for quite some time, and either splitting or
> > removing private entries from the directmap is the well-discussed way I'm
> > aware of to avoid RMP violations due to some other kernel process using
> > a 2MB mapping to access shared memory if there are private pages that
> > happen to be within that range.
> > 
> > In both cases, how to restore the directmap as 2M becomes a
> > problem.
> > 
> > I was also under the impression TDX had similar requirements. If so,
> > do you know what the plan is for handling this for TDX?
> > 
> > There are also 2 potential alternatives I'm aware of, but these haven't
> > been discussed in much detail AFAIK:
> > 
> > a) Ensure confidential guests are backed by 2MB pages. shmem has a way to
> >request 2MB THP pages, but I'm not sure how reliably we can guarantee
> >that enough THPs are available, so if we went that route we'd probably
> >be better off requiring the use of hugetlbfs as the backing store. But
> >obviously that's a bit limiting and it would be nice to have the option
> >of using normal pages as well. One nice thing with the invalidation
> >scheme proposed here is that this would "Just Work" if we implement
> >hugetlbfs support, so an admin that doesn't want any directmap
> >splitting has this option available, otherwise it's done as a
> >best-effort.
> > 
> > b) Implement general support for restoring directmap as 2M even when
> >subpages might be in use by other kernel threads. This would be the
> >most flexible approach since it requires no special handling during
> >invalidations, but I think it's only possible if all the CPA
> >attributes for the 2M range are the same at the time the mapping is
> >restored/unsplit, so there are some potential locking issues, and
> >still a chance of splitting the directmap over time.
> 
> I've been hoping that
> 
> c) using a mechanism such as [1] [2], where the goal is to group together
> these small allocations that need increased directmap granularity, so
> that the maximum number of large mappings is preserved.

As I mentioned in the other thread, the restricted memfd can be backed by
secretmem instead of plain memfd. It already handles directmap with care.
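
For anyone unfamiliar with secretmem: it is exposed to userspace as
memfd_secret(2), added in Linux 5.14. A minimal usage sketch follows;
note that SYS_memfd_secret needs recent libc/kernel headers, and many
kernels also require the secretmem.enable=1 boot parameter:

#define _GNU_SOURCE
#include <sys/syscall.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    /* Pages backing this fd are removed from the kernel direct map
     * while they are in use. */
    int fd = syscall(SYS_memfd_secret, 0);
    if (fd < 0 || ftruncate(fd, 4096) < 0) {
        return 1; /* secretmem disabled or kernel too old */
    }
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED) {
        return 1;
    }
    p[0] = 42; /* reachable through this mapping, not the direct map */
    munmap(p, 4096);
    close(fd);
    return 0;
}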

But I don't think it has to be part of the initial restricted memfd
implementation. It is an SEV-specific requirement, and AMD folks can
extend the implementation as needed later.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov



Re: [PATCH v2 05/15] migration: Remove RAMState.f references in compression code

2022-11-14 Thread Juan Quintela
Peter Xu  wrote:
> Remove references to RAMState.f in compress_page_with_multi_thread() and
> flush_compressed_data().
>
> The compression code isn't compatible with having >1 channels (it
> wouldn't currently know which channel to flush the compressed data
> to), so to keep it simple we always flush on the default to_dst_file
> port until someone wants to add support for >1 ports, especially as
> rs->f can now really change (after postcopy preempt was introduced).
>
> There should be no functional change at all after the patch is applied,
> since wherever rs->f is referenced in the compression code, it must be
> to_dst_file.
>
> Reviewed-by: Dr. David Alan Gilbert 
> Signed-off-by: Peter Xu 

Reviewed-by: Juan Quintela 




[PATCH v2 01/15] nbd/client: Add safety check on chunk payload length

2022-11-14 Thread Eric Blake
Our existing use of structured replies either reads into a qiov capped
at 32M (NBD_CMD_READ) or caps allocation to 1000 bytes (see
NBD_MAX_MALLOC_PAYLOAD in block/nbd.c).  But the existing length
checks are rather late; if we encounter a buggy (or malicious) server
that sends a super-large payload length, we should drop the connection
right then rather than assuming the layer on top will be careful.
This becomes more important when we permit 64-bit lengths which are
even more likely to have the potential for attempted denial of service
abuse.

Signed-off-by: Eric Blake 
---
 nbd/client.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/nbd/client.c b/nbd/client.c
index 90a6b7b38b..cd97a2aa09 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -1412,6 +1412,18 @@ static int nbd_receive_structured_reply_chunk(QIOChannel *ioc,
 chunk->handle = be64_to_cpu(chunk->handle);
 chunk->length = be32_to_cpu(chunk->length);

+/*
+ * Because we use BLOCK_STATUS with REQ_ONE, and cap READ requests
+ * at 32M, no valid server should send us payload larger than
+ * this.  Even if we stopped using REQ_ONE, sane servers will cap
+ * the number of extents they return for block status.
+ */
+if (chunk->length > NBD_MAX_BUFFER_SIZE + sizeof(NBDStructuredReadData)) {
+error_setg(errp, "server chunk %" PRIu32 " (%s) payload is too long",
+   chunk->type, nbd_rep_lookup(chunk->type));
+return -EINVAL;
+}
+
 return 0;
 }

-- 
2.38.1
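
For reference, the bound in the new check works out like this (a
hedged sketch: NBD_MAX_BUFFER_SIZE is QEMU's 32M read cap, and
NBDStructuredReadData is assumed to be just the 8-byte offset that
prefixes the data in a structured read chunk):

#include <assert.h>
#include <stdint.h>

#define NBD_MAX_BUFFER_SIZE (32 * 1024 * 1024) /* 32M read cap */

typedef struct NBDStructuredReadData {
    uint64_t offset; /* prefixes the payload of OFFSET_DATA chunks */
} NBDStructuredReadData;

int main(void)
{
    /* Largest chunk payload the client will now accept. */
    assert(NBD_MAX_BUFFER_SIZE + sizeof(NBDStructuredReadData) == 33554440);
    return 0;
}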




[PULL 0/2] target-arm queue for rc1

2022-11-14 Thread Peter Maydell
Hi; here's the arm pullreq for rc1. Just one bugfix and
a MAINTAINERS file update...

thanks
-- PMM

The following changes since commit 305f6f62d9d250a32cdf090ddcb7e3a5b26a342e:

  Merge tag 'pull-la-20221112' of https://gitlab.com/rth7680/qemu into staging (2022-11-12 09:17:06 -0500)

are available in the Git repository at:

  https://git.linaro.org/people/pmaydell/qemu-arm.git tags/pull-target-arm-20221114

for you to fetch changes up to d9721f19cd05a382f4f5a7093c80d1c4a8a1aa82:

  hw/intc/arm_gicv3: fix prio masking on pmr write (2022-11-14 15:10:58 +)


target-arm queue:
 * hw/intc/arm_gicv3: fix prio masking on pmr write
 * MAINTAINERS: Update maintainer's email for Xilinx CAN


Jens Wiklander (1):
  hw/intc/arm_gicv3: fix prio masking on pmr write

Vikram Garhwal (1):
  MAINTAINERS: Update maintainer's email for Xilinx CAN

 hw/intc/arm_gicv3_cpuif.c | 3 +--
 MAINTAINERS   | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)



[PATCH v2 05/15] nbd/server: Refactor handling of request payload

2022-11-14 Thread Eric Blake
Upcoming additions to support NBD 64-bit effect lengths allow for the
possibility to distinguish between payload length (capped at 32M) and
effect length (up to 63 bits).  Without that extension, only the
NBD_CMD_WRITE request has a payload; but with the extension, it makes
sense to allow at least NBD_CMD_BLOCK_STATUS to have both a payload
and effect length (where the payload is a limited-size struct that in
turn gives the real effect length as well as a subset of known ids
for which status is requested).  Other future NBD commands may also
have a request payload, so the 64-bit extension introduces a new
NBD_CMD_FLAG_PAYLOAD_LEN that distinguishes between whether the header
length is a payload length or an effect length, rather than
hard-coding the decision based on the command.  Note that we do not
support the payload version of BLOCK_STATUS yet.
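
Schematically, the rule being introduced looks like the sketch below
(the flag's bit position and the helper name are illustrative, not the
patch's actual code):

#include <stdbool.h>
#include <stdint.h>

#define NBD_CMD_WRITE            1
#define NBD_CMD_FLAG_PAYLOAD_LEN (1 << 5) /* assumed bit position */

/* Does the request header's length field count payload bytes on the
 * wire, rather than the command's effect length? */
bool header_len_is_payload(bool extended_headers, uint16_t flags,
                           uint16_t type)
{
    if (extended_headers) {
        return flags & NBD_CMD_FLAG_PAYLOAD_LEN;
    }
    /* Without the extension, only WRITE carries a payload. */
    return type == NBD_CMD_WRITE;
}

int main(void)
{
    /* With extended headers, the client states the payload explicitly. */
    return !header_len_is_payload(true, NBD_CMD_FLAG_PAYLOAD_LEN,
                                  NBD_CMD_WRITE);
}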

For this patch, no semantic change is intended for a compliant client.
For a non-compliant client, it is possible that the error behavior
changes (a different message, a change in whether the connection is
killed or remains alive for the next command, and so forth), but all
errors should still be handled gracefully.

Signed-off-by: Eric Blake 
---
 nbd/server.c | 53 
 nbd/trace-events |  1 +
 2 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/nbd/server.c b/nbd/server.c
index 7738f5f899..ad5c2052b5 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -2316,6 +2316,8 @@ static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
   Error **errp)
 {
 NBDClient *client = req->client;
+bool extended_with_payload;
+int payload_len = 0;
 int valid_flags;
 int ret;

@@ -2329,27 +2331,40 @@ static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
 trace_nbd_co_receive_request_decode_type(request->handle, request->type,
  nbd_cmd_lookup(request->type));

-if (request->type != NBD_CMD_WRITE) {
-/* No payload, we are ready to read the next request.  */
-req->complete = true;
-}
-
 if (request->type == NBD_CMD_DISC) {
 /* Special case: we're going to disconnect without a reply,
  * whether or not flags, from, or len are bogus */
+req->complete = true;
 return -EIO;
 }

+/* Payload and buffer handling. */
+extended_with_payload = client->extended_headers &&
+(request->flags & NBD_CMD_FLAG_PAYLOAD_LEN);
 if (request->type == NBD_CMD_READ || request->type == NBD_CMD_WRITE ||
-request->type == NBD_CMD_CACHE)
-{
+request->type == NBD_CMD_CACHE || extended_with_payload) {
 if (request->len > NBD_MAX_BUFFER_SIZE) {
 error_setg(errp, "len (%" PRIu64" ) is larger than max len (%u)",
request->len, NBD_MAX_BUFFER_SIZE);
 return -EINVAL;
 }

-if (request->type != NBD_CMD_CACHE) {
+if (request->type == NBD_CMD_WRITE || extended_with_payload) {
+payload_len = request->len;
+if (request->type != NBD_CMD_WRITE) {
+/*
+ * For now, we don't support payloads on other
+ * commands; but we can keep the connection alive.
+ */
+request->len = 0;
+} else if (client->extended_headers && !extended_with_payload) {
+/* The client is noncompliant. Trace it, but proceed. */
+trace_nbd_co_receive_ext_payload_compliance(request->from,
+request->len);
+}
+}
+
+if (request->type == NBD_CMD_WRITE || request->type == NBD_CMD_READ) {
 req->data = blk_try_blockalign(client->exp->common.blk,
request->len);
 if (req->data == NULL) {
@@ -2359,18 +2374,20 @@ static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
 }
 }

-if (request->type == NBD_CMD_WRITE) {
-assert(request->len <= NBD_MAX_BUFFER_SIZE);
-if (nbd_read(client->ioc, req->data, request->len, "CMD_WRITE data",
- errp) < 0)
-{
+if (payload_len) {
+if (req->data) {
+ret = nbd_read(client->ioc, req->data, payload_len,
+   "CMD_WRITE data", errp);
+} else {
+ret = nbd_drop(client->ioc, payload_len, errp);
+}
+if (ret < 0) {
 return -EIO;
 }
-req->complete = true;
-
 trace_nbd_co_receive_request_payload_received(request->handle,
-  request->len);
+  payload_len);
 }
+req->complete = true;

 /* Sanity checks. */
 if (client->exp->nbdflags & NBD_FLAG_READ_ONLY &&
@@ -2400

Re: [PATCH] hw/intc/arm_gicv3: fix prio masking on pmr write

2022-11-14 Thread Peter Maydell
On Mon, 14 Nov 2022 at 13:33, Jens Wiklander  wrote:
>
> With commit 39f29e599355 ("hw/intc/arm_gicv3: Use correct number of
> priority bits for the CPU") the number of priority bits was changed from
> the maximum value 8 to typically 5. As a consequence a few of the lowest
> bits in ICC_PMR_EL1 become RAZ/WI. However, prior to this patch one of
> these bits was still used since the supplied priority value is masked
> before it's eventually right shifted by one bit. So the bit is not
> lost as one might expect when the register is read again.
>
> The Linux kernel depends on the lowest valid bit being reset to zero; see
> commit 33625282adaa ("irqchip/gic-v3: Probe for SCR_EL3 being clear
> before resetting AP0Rn") for details.
>
> So fix this by masking the priority value after it may have been right
> shifted by one bit.
>
> Fixes: 39f29e599355 ("hw/intc/arm_gicv3: Use correct number of priority bits for the CPU")
> Signed-off-by: Jens Wiklander 

Thanks for the fix; applied to target-arm.next for 7.2.
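
For readers following along, a self-contained model of the ordering
issue (hypothetical helper, not the actual QEMU code):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Model of the PMR write path with 5 priority bits implemented: the
 * low 3 bits of ICC_PMR_EL1 are RAZ/WI (mask 0xf8), and a non-secure
 * write with SCR_EL3.FIQ clear is shifted into the secure range. */
static uint32_t pmr_write(uint32_t value, int ns_shift, int mask_first)
{
    const uint32_t prio_mask = 0xf8;

    if (mask_first) {
        value &= prio_mask; /* buggy: mask applied before the shift */
    }
    if (ns_shift) {
        value = (value >> 1) | 0x80;
    }
    if (!mask_first) {
        value &= prio_mask; /* fixed: mask applied after the shift */
    }
    return value;
}

int main(void)
{
    /* Writing 0x08: the buggy order leaves a RAZ/WI bit set (0x84);
     * the fixed order drops it (0x80), which is what Linux expects. */
    printf("buggy: 0x%02" PRIx32 ", fixed: 0x%02" PRIx32 "\n",
           pmr_write(0x08, 1, 1), pmr_write(0x08, 1, 0));
    return 0;
}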

> I've only tested this patch on top of v7.1.0 since I couldn't get current
> to run in my test setup.
>
> In case anyone wonders what I'm testing, it's a setup with Hafnium at
> S-EL2, OP-TEE at S-EL1 and the Linux kernel at NS-EL1 (no NS-EL2 for
> simplicity).

Now is a good time to figure out what's not working with current
QEMU, so that if it's a bug in QEMU we can fix it before the
7.2 release. Could you try a bisect of QEMU to see where it broke?
Alternatively, if you have repro instructions and prebuilt image
files I can have a look.

thanks
-- PMM


