Yet it seems to work fine in light testing.
-->
vsock/virtio: use DWARF ULEB128 to record EOM boundaries, enable cross-EOM skb coalescing
virtio_transport_recv_enqueue() currently refuses to coalesce an
incoming skb with the previous one when the previous skb carries
VIRTIO_VSOCK_SEQ_EOM. This forces one skb per seqpacket message.
For workloads with many small or zero-byte messages the per-skb
overhead (~960 bytes) dominates; since credit accounting covers only
payload bytes, receive memory can grow without bound.
Decouple message boundary tracking from the skb structure: record
each message's length in a compact side buffer using DWARF ULEB128
encoding with the EOR flag folded into the low bit, then allow the
data of multiple complete messages to be coalesced into a single
skb.
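For example, a 300-byte EOR message encodes as (300 << 1) | 1 = 601,
emitted as the two bytes 0xd9 0x04; a zero-byte non-EOR message is a
single 0x00 byte.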
Cross-EOM coalescing fires only when:
- both the tail skb and the incoming packet carry EOM (complete msgs)
- the incoming packet fits in the tail skb's tailroom
- no BPF psock is attached (read_skb expects one msg per skb)
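For instance, a burst of zero-byte EOM messages collapses into the
tail skb at a cost of one boundary byte each, rather than ~960 bytes
of skb overhead per message.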
On allocation failure the code falls back to separate skbs (existing
behaviour). Credit accounting is unchanged; the boundary buffer is
capped at PAGE_SIZE.
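Messages up to 63 bytes encode as one-byte boundary entries, so a
full buffer tracks roughly 4096 coalesced messages on 4 KiB-page
systems before coalescing falls back to separate skbs.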
Signed-off-by: Michael S. Tsirkin <[email protected]>
Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]>
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index f91704731057..e36b9ab28372 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -12,6 +12,7 @@
struct virtio_vsock_skb_cb {
bool reply;
bool tap_delivered;
+ bool has_boundary_entries;
u32 offset;
};
@@ -167,6 +168,12 @@ struct virtio_vsock_sock {
u32 buf_used;
struct sk_buff_head rx_queue;
u32 msg_count;
+
+ /* ULEB128-encoded seqpacket message boundary buffer */
+ u8 *boundary_buf;
+ u32 boundary_len;
+ u32 boundary_alloc;
+ u32 boundary_off;
};
struct virtio_vsock_pkt_info {
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 416d533f493d..81654f70f72c 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -11,6 +11,7 @@
#include <linux/sched/signal.h>
#include <linux/ctype.h>
#include <linux/list.h>
+#include <linux/skmsg.h>
#include <linux/virtio_vsock.h>
#include <uapi/linux/vsockmon.h>
@@ -26,6 +27,91 @@
/* Threshold for detecting small packets to copy */
#define GOOD_COPY_LEN 128
+#define VSOCK_BOUNDARY_BUF_INIT 64
+#define VSOCK_BOUNDARY_BUF_MAX PAGE_SIZE
+
+/* ULEB128 boundary encoding: value = (msg_len << 1) | eor.
+ * Each byte carries 7 data bits; bit 7 is set on all but the last byte.
+ * Max 5 bytes for a u32 msg_len (33 bits with eor shift).
+ */
+static int vsock_uleb_encode_boundary(u8 *buf, u32 msg_len, bool eor)
+{
+ u64 val = ((u64)msg_len << 1) | eor;
+ int n = 0;
+
+ do {
+ buf[n] = val & 0x7f;
+ val >>= 7;
+ if (val)
+ buf[n] |= 0x80;
+ n++;
+ } while (val);
+
+ return n;
+}
+
+static int vsock_uleb_decode_boundary(const u8 *buf, u32 avail,
+ u32 *msg_len, bool *eor)
+{
+ u64 val = 0;
+ int shift = 0;
+ int n = 0;
+
+ do {
+ if (n >= avail || shift >= 35)
+ return -EINVAL;
+ val |= (u64)(buf[n] & 0x7f) << shift;
+ shift += 7;
+ } while (buf[n++] & 0x80);
+
+ *eor = val & 1;
+ *msg_len = val >> 1;
+ return n;
+}
+
+static void vsock_boundary_buf_compact(struct virtio_vsock_sock *vvs)
+{
+ if (vvs->boundary_off == 0)
+ return;
+
+ vvs->boundary_len -= vvs->boundary_off;
+ memmove(vvs->boundary_buf, vvs->boundary_buf + vvs->boundary_off,
+ vvs->boundary_len);
+ vvs->boundary_off = 0;
+}
+
+static int vsock_boundary_buf_ensure(struct virtio_vsock_sock *vvs, u32 needed)
+{
+ u32 new_alloc;
+ u8 *new_buf;
+
+ if (vvs->boundary_alloc >= needed)
+ return 0;
+
+ /* Reclaim consumed space before growing */
+ if (vvs->boundary_off) {
+ needed -= vvs->boundary_off;
+ vsock_boundary_buf_compact(vvs);
+ if (vvs->boundary_alloc >= needed)
+ return 0;
+ }
+
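+ /* Grow geometrically; the PAGE_SIZE cap is applied below */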
+ new_alloc = max(needed, vvs->boundary_alloc ? vvs->boundary_alloc * 2
+ : VSOCK_BOUNDARY_BUF_INIT);
+ if (new_alloc > VSOCK_BOUNDARY_BUF_MAX)
+ new_alloc = VSOCK_BOUNDARY_BUF_MAX;
+ if (new_alloc < needed)
+ return -ENOMEM;
+
+ new_buf = krealloc(vvs->boundary_buf, new_alloc, GFP_ATOMIC);
+ if (!new_buf)
+ return -ENOMEM;
+
+ vvs->boundary_buf = new_buf;
+ vvs->boundary_alloc = new_alloc;
+ return 0;
+}
+
static void virtio_transport_cancel_close_work(struct vsock_sock *vsk,
bool cancel_timeout);
static s64 virtio_transport_has_space(struct virtio_vsock_sock *vvs);
@@ -682,41 +768,74 @@ virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk,
total = 0;
len = msg_data_left(msg);
- skb_queue_walk(&vvs->rx_queue, skb) {
- struct virtio_vsock_hdr *hdr;
+ skb = skb_peek(&vvs->rx_queue);
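+ /* Coalesced skb: each boundary entry gives one message's length */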
+ if (skb && VIRTIO_VSOCK_SKB_CB(skb)->has_boundary_entries) {
+ u32 msg_len, offset;
+ size_t bytes;
+ bool eor;
+ int ret;
- if (total < len) {
- size_t bytes;
+ ret = vsock_uleb_decode_boundary(
+ vvs->boundary_buf + vvs->boundary_off,
+ vvs->boundary_len - vvs->boundary_off,
+ &msg_len, &eor);
+ if (ret < 0)
+ goto unlock;
+
+ offset = VIRTIO_VSOCK_SKB_CB(skb)->offset;
+ bytes = min(len, (size_t)msg_len);
+
+ if (bytes) {
int err;
- bytes = len - total;
- if (bytes > skb->len)
- bytes = skb->len;
-
spin_unlock_bh(&vvs->rx_lock);
-
- /* sk_lock is held by caller so no one else can dequeue.
- * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
- */
- err = skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset,
+ err = skb_copy_datagram_iter(skb, offset,
&msg->msg_iter, bytes);
if (err)
return err;
-
spin_lock_bh(&vvs->rx_lock);
}
- total += skb->len;
- hdr = virtio_vsock_hdr(skb);
+ total = msg_len;
+ if (eor)
+ msg->msg_flags |= MSG_EOR;
+ } else {
+ skb_queue_walk(&vvs->rx_queue, skb) {
+ struct virtio_vsock_hdr *hdr;
- if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
- if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR)
- msg->msg_flags |= MSG_EOR;
+ if (total < len) {
+ size_t bytes;
+ int err;
- break;
+ bytes = len - total;
+ if (bytes > skb->len)
+ bytes = skb->len;
+
+ spin_unlock_bh(&vvs->rx_lock);
+
+ err = skb_copy_datagram_iter(
+ skb,
+ VIRTIO_VSOCK_SKB_CB(skb)->offset,
+ &msg->msg_iter, bytes);
+ if (err)
+ return err;
+
+ spin_lock_bh(&vvs->rx_lock);
+ }
+
+ total += skb->len;
+ hdr = virtio_vsock_hdr(skb);
+
+ if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
+ if (le32_to_cpu(hdr->flags) &
+ VIRTIO_VSOCK_SEQ_EOR)
+ msg->msg_flags |= MSG_EOR;
+ break;
+ }
}
}
+unlock:
spin_unlock_bh(&vvs->rx_lock);
return total;
@@ -740,57 +859,105 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
}
while (!msg_ready) {
- struct virtio_vsock_hdr *hdr;
- size_t pkt_len;
-
- skb = __skb_dequeue(&vvs->rx_queue);
+ skb = skb_peek(&vvs->rx_queue);
if (!skb)
break;
- hdr = virtio_vsock_hdr(skb);
- pkt_len = (size_t)le32_to_cpu(hdr->len);
- if (dequeued_len >= 0) {
+ if (VIRTIO_VSOCK_SKB_CB(skb)->has_boundary_entries) {
size_t bytes_to_copy;
+ u32 msg_len, offset;
+ bool eor;
+ int ret;
- bytes_to_copy = min(user_buf_len, pkt_len);
+ ret = vsock_uleb_decode_boundary(
+ vvs->boundary_buf + vvs->boundary_off,
+ vvs->boundary_len - vvs->boundary_off,
+ &msg_len, &eor);
+ if (ret < 0)
+ break;
+ vvs->boundary_off += ret;
- if (bytes_to_copy) {
+ offset = VIRTIO_VSOCK_SKB_CB(skb)->offset;
+ bytes_to_copy = min(user_buf_len, (size_t)msg_len);
+
+ if (bytes_to_copy && dequeued_len >= 0) {
int err;
- /* sk_lock is held by caller so no one else can dequeue.
- * Unlock rx_lock since skb_copy_datagram_iter() may sleep.
- */
spin_unlock_bh(&vvs->rx_lock);
-
- err = skb_copy_datagram_iter(skb, 0,
+ err = skb_copy_datagram_iter(skb, offset,
&msg->msg_iter,
bytes_to_copy);
- if (err) {
- /* Copy of message failed. Rest of
- * fragments will be freed without copy.
- */
- dequeued_len = err;
- } else {
- user_buf_len -= bytes_to_copy;
- }
-
spin_lock_bh(&vvs->rx_lock);
+ if (err)
+ dequeued_len = err;
+ else
+ user_buf_len -= bytes_to_copy;
}
if (dequeued_len >= 0)
- dequeued_len += pkt_len;
- }
+ dequeued_len += msg_len;
- if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
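+ /* Consume the head message in place; the skb may still hold more */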
+ VIRTIO_VSOCK_SKB_CB(skb)->offset += msg_len;
msg_ready = true;
vvs->msg_count--;
- if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR)
+ if (eor)
msg->msg_flags |= MSG_EOR;
- }
- virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len);
- kfree_skb(skb);
+ virtio_transport_dec_rx_pkt(vvs, msg_len, msg_len);
+
+ if (VIRTIO_VSOCK_SKB_CB(skb)->offset >= skb->len) {
+ __skb_unlink(skb, &vvs->rx_queue);
+ kfree_skb(skb);
+ }
+
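+ /* Reclaim consumed entries once half the buffer is dead space */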
+ if (vvs->boundary_off >= vvs->boundary_len / 2)
+ vsock_boundary_buf_compact(vvs);
+ } else {
+ struct virtio_vsock_hdr *hdr;
+ size_t pkt_len;
+
+ skb = __skb_dequeue(&vvs->rx_queue);
+ if (!skb)
+ break;
+ hdr = virtio_vsock_hdr(skb);
+ pkt_len = (size_t)le32_to_cpu(hdr->len);
+
+ if (dequeued_len >= 0) {
+ size_t bytes_to_copy;
+
+ bytes_to_copy = min(user_buf_len, pkt_len);
+
+ if (bytes_to_copy) {
+ int err;
+
+ spin_unlock_bh(&vvs->rx_lock);
+ err = skb_copy_datagram_iter(
+ skb, 0, &msg->msg_iter,
+ bytes_to_copy);
+ if (err)
+ dequeued_len = err;
+ else
+ user_buf_len -= bytes_to_copy;
+ spin_lock_bh(&vvs->rx_lock);
+ }
+
+ if (dequeued_len >= 0)
+ dequeued_len += pkt_len;
+ }
+
+ if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
+ msg_ready = true;
+ vvs->msg_count--;
+
+ if (le32_to_cpu(hdr->flags) &
+ VIRTIO_VSOCK_SEQ_EOR)
+ msg->msg_flags |= MSG_EOR;
+ }
+
+ virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len);
+ kfree_skb(skb);
+ }
}
spin_unlock_bh(&vvs->rx_lock);
@@ -1132,6 +1299,7 @@ void virtio_transport_destruct(struct vsock_sock *vsk)
virtio_transport_cancel_close_work(vsk, true);
+ kfree(vvs->boundary_buf);
kfree(vvs);
vsk->trans = NULL;
}
@@ -1224,6 +1392,11 @@ static void virtio_transport_remove_sock(struct vsock_sock *vsk)
* removing it.
*/
__skb_queue_purge(&vvs->rx_queue);
+ kfree(vvs->boundary_buf);
+ vvs->boundary_buf = NULL;
+ vvs->boundary_len = 0;
+ vvs->boundary_alloc = 0;
+ vvs->boundary_off = 0;
vsock_remove_sock(vsk);
}
@@ -1395,23 +1568,62 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk,
!skb_is_nonlinear(skb)) {
struct virtio_vsock_hdr *last_hdr;
struct sk_buff *last_skb;
+ bool last_has_eom;
+ bool has_eom;
last_skb = skb_peek_tail(&vvs->rx_queue);
last_hdr = virtio_vsock_hdr(last_skb);
+ last_has_eom = le32_to_cpu(last_hdr->flags) & VIRTIO_VSOCK_SEQ_EOM;
+ has_eom = le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM;
- /* If there is space in the last packet queued, we copy the
- * new packet in its buffer. We avoid this if the last packet
- * queued has VIRTIO_VSOCK_SEQ_EOM set, because this is
- * delimiter of SEQPACKET message, so 'pkt' is the first packet
- * of a new message.
- */
- if (skb->len < skb_tailroom(last_skb) &&
- !(le32_to_cpu(last_hdr->flags) & VIRTIO_VSOCK_SEQ_EOM)) {
- memcpy(skb_put(last_skb, skb->len), skb->data, skb->len);
- free_pkt = true;
- last_hdr->flags |= hdr->flags;
- le32_add_cpu(&last_hdr->len, len);
- goto out;
+ if (skb->len < skb_tailroom(last_skb)) {
+ if (!last_has_eom) {
+ /* Same-message coalescing (existing path) */
+ memcpy(skb_put(last_skb, skb->len),
+ skb->data, skb->len);
+ free_pkt = true;
+ last_hdr->flags |= hdr->flags;
+ le32_add_cpu(&last_hdr->len, len);
+ goto out;
+ }
+
+ /* Cross-EOM: coalesce complete messages into one skb,
+ * recording message boundaries in the compact ULEB128
+ * buffer. Only done when the incoming packet also has
+ * EOM (a complete msg).
+ */
+ if (has_eom && !sk_psock(sk_vsock(vsk))) {
+ bool prev_eor, cur_eor;
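+ /* room for two ULEB128 entries of at most 5 bytes each */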
+ u8 tmp[12];
+ int n = 0;
+
+ cur_eor = le32_to_cpu(hdr->flags) &
+ VIRTIO_VSOCK_SEQ_EOR;
+
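+ /* First merge into this skb: record the tail's own boundary first */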
+ if (!VIRTIO_VSOCK_SKB_CB(last_skb)->has_boundary_entries) {
+ u32 prev_len = le32_to_cpu(last_hdr->len);
+
+ prev_eor = le32_to_cpu(last_hdr->flags) & VIRTIO_VSOCK_SEQ_EOR;
+ n += vsock_uleb_encode_boundary(
+ tmp + n, prev_len, prev_eor);
+ }
+ n += vsock_uleb_encode_boundary(
+ tmp + n, len, cur_eor);
+
+ if (!vsock_boundary_buf_ensure(
+ vvs, vvs->boundary_len + n)) {
+ memcpy(vvs->boundary_buf +
+ vvs->boundary_len, tmp, n);
+ vvs->boundary_len += n;
+ VIRTIO_VSOCK_SKB_CB(last_skb)->has_boundary_entries = true;
+ memcpy(skb_put(last_skb, skb->len),
+ skb->data, skb->len);
+ free_pkt = true;
+ last_hdr->flags |= hdr->flags;
+ le32_add_cpu(&last_hdr->len, len);
+ goto out;
+ }
+ }
}
}