On macOS we need to increase the unix socket buffer sizes on both the client and the server to get good performance. We set the socket buffer sizes on macOS after connecting to a server or accepting a client connection.
Testing with qemu-nbd shows that reading an image with qemu-img convert from qemu-nbd is *11.4 times faster* and qemu-img cpu usage is *8.3 times lower*.

| qemu-img | qemu-nbd | time   | user   | system |
|----------|----------|--------|--------|--------|
| before   | before   | 12.957 | 2.643  | 5.777  |
| after    | before   | 12.803 | 2.632  | 5.742  |
| before   | after    | 1.139  | 0.074  | 0.905  |
| after    | after    | 1.179  | 0.077  | 0.931  |

To test buffer sizes, I built qemu-nbd and qemu-img with send buffer sizes from 64k to 2m. In this test a 256k send buffer and a 1m receive buffer are optimal.

| send buffer | recv buffer | time   | user   | system |
|-------------|-------------|--------|--------|--------|
| 64k         | 256k        | 2.233  | 0.290  | 1.408  |
| 128k        | 512k        | 1.189  | 0.103  | 0.841  |
| 256k        | 1024k       | 1.121  | 0.085  | 0.813  |
| 512k        | 2048k       | 1.172  | 0.081  | 0.953  |
| 1024k       | 4096k       | 1.160  | 0.072  | 0.907  |
| 2048k       | 8192k       | 1.309  | 0.056  | 0.960  |

Using the null-co driver is useful to focus on the read part, but in the real world we do something with the read data. I tested real world usage with nbdcopy and blksum.

I tested computing a hash of the image using nbdcopy, using 4 NBD connections and a 256k request size. In this test a 1m send buffer and a 4m receive buffer are optimal.

| send buffer | recv buffer | time   | user   | system |
|-------------|-------------|--------|--------|--------|
| 64k         | 256k        | 2.832  | 4.866  | 2.550  |
| 128k        | 512k        | 2.429  | 4.762  | 2.037  |
| 256k        | 1024k       | 2.158  | 4.724  | 1.813  |
| 512k        | 2048k       | 1.777  | 4.632  | 1.790  |
| 1024k       | 4096k       | 1.657  | 4.466  | 1.812  |
| 2048k       | 8192k       | 1.782  | 4.570  | 1.912  |

I tested creating a hash of the image with blksum, using one NBD connection and a 256k read size. In this test a 2m send buffer and an 8m receive buffer are optimal.
| send buffer | recv buffer | time   | user   | system |
|-------------|-------------|--------|--------|--------|
| 64k         | 256k        | 4.233  | 5.242  | 2.632  |
| 128k        | 512k        | 3.329  | 4.915  | 2.015  |
| 256k        | 1024k       | 2.071  | 4.647  | 1.474  |
| 512k        | 2048k       | 1.980  | 4.554  | 1.432  |
| 1024k       | 4096k       | 2.058  | 4.553  | 1.497  |
| 2048k       | 8192k       | 1.972  | 4.539  | 1.497  |

In the real world tests larger buffers are optimal, so I picked a send buffer of 1m and a receive buffer of 4m. This will also improve other uses of unix domain sockets on macOS, but I tested only reading from qemu-nbd.

The same change for libnbd:
https://gitlab.com/nbdkit/libnbd/-/merge_requests/21

Signed-off-by: Nir Soffer <nir...@gmail.com>
---
 io/channel-socket.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

Changes since v1:
- Add UNIX_SOCKET_*_BUFFER_SIZE macros (Philippe)
- Handle both server and client sockets
- Add qio_channel_socket_set_buffers() helper for cleaner code
- Add test results for qemu-img convert
- Add test results for different buffer sizes
- Link to same change in libnbd

v1 was here:
https://lists.gnu.org/archive/html/qemu-devel/2025-04/msg03081.html

diff --git a/io/channel-socket.c b/io/channel-socket.c
index 608bcf066e..635c5c973d 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -21,6 +21,7 @@
 #include "qapi/error.h"
 #include "qapi/qapi-visit-sockets.h"
 #include "qemu/module.h"
+#include "qemu/units.h"
 #include "io/channel-socket.h"
 #include "io/channel-util.h"
 #include "io/channel-watch.h"
@@ -37,6 +38,33 @@

 #define SOCKET_MAX_FDS 16

+/*
+ * Apple recommends sizing the receive buffer at 4 times the size of the send
+ * buffer. Testing shows that a 1 MiB send buffer and a 4 MiB receive buffer
+ * give the best throughput and lowest cpu usage.
+ */ +#ifdef __APPLE__ +#define UNIX_SOCKET_SEND_BUFFER_SIZE (1 * MiB) +#define UNIX_SOCKET_RECV_BUFFER_SIZE (4 * UNIX_SOCKET_SEND_BUFFER_SIZE) +#endif /* __APPLE__ */ + +static void qio_channel_socket_set_buffers(QIOChannelSocket *ioc) +{ +#ifdef __APPLE__ + if (ioc->localAddr.ss_family == AF_UNIX) { + int value; + + /* This is a performance optimization; don't fail on errors. */ + + value = UNIX_SOCKET_SEND_BUFFER_SIZE; + setsockopt(ioc->fd, SOL_SOCKET, SO_SNDBUF, &value, sizeof(value)); + + value = UNIX_SOCKET_RECV_BUFFER_SIZE; + setsockopt(ioc->fd, SOL_SOCKET, SO_RCVBUF, &value, sizeof(value)); + } +#endif /* __APPLE__ */ +} + SocketAddress * qio_channel_socket_get_local_address(QIOChannelSocket *ioc, Error **errp) @@ -174,6 +202,8 @@ int qio_channel_socket_connect_sync(QIOChannelSocket *ioc, } #endif + qio_channel_socket_set_buffers(ioc); + qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK); @@ -410,6 +440,8 @@ qio_channel_socket_accept(QIOChannelSocket *ioc, } #endif /* WIN32 */ + qio_channel_socket_set_buffers(cioc); + qio_channel_set_feature(QIO_CHANNEL(cioc), QIO_CHANNEL_FEATURE_READ_MSG_PEEK); -- 2.39.5 (Apple Git-154)