On 8/17/25 01:13, Brian Song wrote:
> 
> 
> On 8/14/25 11:46 PM, Brian Song wrote:
>> From: Brian Song <hibrians...@gmail.com>
>>
>> This patch adds a new export option for storage-export-daemon to enable
>> or disable FUSE-over-io_uring via the switch io-uring=on|off (disabled
>> by default). It also implements the protocol handshake with the Linux
>> kernel during the FUSE-over-io_uring initialization phase.
>>
>> See: https://docs.kernel.org/filesystems/fuse-io-uring.html
>>
>> The kernel documentation describes in detail how FUSE-over-io_uring
>> works. This patch implements the Initial SQE stage shown in the diagram:
>> it initializes one queue per IOThread, each currently supporting a
>> single submission queue entry (SQE). When the FUSE driver sends the
>> first FUSE request (FUSE_INIT), storage-export-daemon calls
>> fuse_uring_start() to complete initialization, ultimately submitting
>> the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
>> successful initialization with the kernel.
>>
>> Suggested-by: Kevin Wolf <kw...@redhat.com>
>> Suggested-by: Stefan Hajnoczi <stefa...@redhat.com>
>> Signed-off-by: Brian Song <hibrians...@gmail.com>
>> ---
>>   block/export/fuse.c                  | 161 ++++++++++++++++++++++++---
>>   docs/tools/qemu-storage-daemon.rst   |  11 +-
>>   qapi/block-export.json               |   5 +-
>>   storage-daemon/qemu-storage-daemon.c |   1 +
>>   util/fdmon-io_uring.c                |   5 +-
>>   5 files changed, 159 insertions(+), 24 deletions(-)
>>
>> diff --git a/block/export/fuse.c b/block/export/fuse.c
>> index c0ad4696ce..59fa79f486 100644
>> --- a/block/export/fuse.c
>> +++ b/block/export/fuse.c
>> @@ -48,6 +48,11 @@
>>   #include <linux/fs.h>
>>   #endif
>>
>> +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
>> +
>> +/* room needed in buffer to accommodate header */
>> +#define FUSE_BUFFER_HEADER_SIZE 0x1000
>> +
>>   /* Prevent overly long bounce buffer allocations */
>>   #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
>>   /*
>> @@ -63,12 +68,31 @@
>>       (FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
>>
>>   typedef struct FuseExport FuseExport;
>> +typedef struct FuseQueue FuseQueue;
>> +
>> +typedef struct FuseRingEnt {
>> +    /* back pointer */
>> +    FuseQueue *q;
>> +
>> +    /* commit id of a fuse request */
>> +    uint64_t req_commit_id;
>> +
>> +    /* fuse request header and payload */
>> +    struct fuse_uring_req_header req_header;
>> +    void *op_payload;
>> +    size_t req_payload_sz;
>> +
>> +    /* The vector passed to the kernel */
>> +    struct iovec iov[2];
>> +
>> +    CqeHandler fuse_cqe_handler;
>> +} FuseRingEnt;
>>
>>   /*
>>    * One FUSE "queue", representing one FUSE FD from which requests are 
>> fetched
>>    * and processed.  Each queue is tied to an AioContext.
>>    */
>> -typedef struct FuseQueue {
>> +struct FuseQueue {
>>       FuseExport *exp;
>>
>>       AioContext *ctx;
>> @@ -109,7 +133,12 @@ typedef struct FuseQueue {
>>        * Free this buffer with qemu_vfree().
>>        */
>>       void *spillover_buf;
>> -} FuseQueue;
>> +
>> +#ifdef CONFIG_LINUX_IO_URING
>> +    int qid;
>> +    FuseRingEnt ent;
>> +#endif
>> +};
>>
>>   /*
>>    * Verify that FuseQueue.request_buf plus the spill-over buffer together
>> @@ -148,6 +177,7 @@ struct FuseExport {
>>       bool growable;
>>       /* Whether allow_other was used as a mount option or not */
>>       bool allow_other;
>> +    bool is_uring;
>>
>>       mode_t st_mode;
>>       uid_t st_uid;
>> @@ -257,6 +287,93 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
>>       .drained_poll  = fuse_export_drained_poll,
>>   };
>>
>> +#ifdef CONFIG_LINUX_IO_URING
>> +
>> +static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
>> +                    const unsigned int qid,
>> +                    const unsigned int commit_id)
>> +{
>> +    req->qid = qid;
>> +    req->commit_id = commit_id;
>> +    req->flags = 0;
>> +}
>> +
>> +static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
>> +               __u32 cmd_op)
>> +{
>> +    sqe->opcode = IORING_OP_URING_CMD;
>> +
>> +    sqe->fd = q->fuse_fd;
>> +    sqe->rw_flags = 0;
>> +    sqe->ioprio = 0;
>> +    sqe->off = 0;
>> +
>> +    sqe->cmd_op = cmd_op;
>> +    sqe->__pad1 = 0;
>> +}
>> +
>> +static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void 
>> *opaque)
>> +{
>> +    FuseQueue *q = opaque;
>> +    struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
>> +
>> +    fuse_uring_sqe_prepare(sqe, q, FUSE_IO_URING_CMD_REGISTER);
>> +
>> +    sqe->addr = (uint64_t)(q->ent.iov);
>> +    sqe->len = 2;
>> +
>> +    fuse_uring_sqe_set_req_data(req, q->qid, 0);
>> +}
>> +
>> +static void fuse_uring_submit_register(void *opaque)
>> +{
>> +    FuseQueue *q = opaque;
>> +    FuseExport *exp = q->exp;
>> +
>> +
>> +    aio_add_sqe(fuse_uring_prep_sqe_register, q, 
>> &(q->ent.fuse_cqe_handler));
> 
> I think there might be a tricky issue with the io_uring integration in 
> QEMU. Currently, when the number of IOThreads goes above ~6 or 7, 
> there’s a pretty high chance of a hang. I added some debug logging in 
> the kernel’s fuse_uring_cmd() registration part, and noticed that the 
> number of register calls is less than the total number of entries in the 
> queue. In theory, we should be registering each entry for each queue.

Did you also try to add logging at the top of fuse_uring_cmd()? I wonder
if there is a startup race and if the initial commands are just getting
refused. I had run into the issues you are describing in some versions of
the -rfc patches, but thought that all of that had been fixed.
I.e., it cannot be ruled out that there is still a kernel issue left.

Thanks,
Bernd



Reply via email to