On Fri, May 08, 2026 at 11:41:21AM +0200, Stefano Garzarella wrote:
> On Thu, May 07, 2026 at 06:48:47PM -0400, Michael S. Tsirkin wrote:
> > On Thu, May 07, 2026 at 02:59:13PM +0200, Stefano Garzarella wrote:
> > > On Thu, May 07, 2026 at 07:45:10AM -0400, Michael S. Tsirkin wrote:
> > > > On Thu, May 07, 2026 at 11:09:47AM +0200, Stefano Garzarella wrote:
> 
> [...]
> 
> > > > > For now, we're already doing something:
> > > > > merging the skuffs if they don't have EOM set.
> > > >
> > > >
> > > > Right that's good. You could go further and merge with EOM too
> > > > if you stick the info about message boundaries somewhere else.
> > > 
> > > This adds a lot of complexity IMO, but we can try.
> > > 
> > > Do you have something in mind?
> > 
> > BER is clearly overkill but here's a POC that claude made for me,
> > just to give u an idea. It's clearly has a ton of issues,
> > for example I dislike how GFP_ATOMIC is handled.
> 
> Okay, I somewhat understand, but clearly this isn't net material

I doubt we have many other options given reverting the regression was
ruled out.


> so for now
> I think the best thing to do is to merge the fixup I sent (or something
> similar):
> https://lore.kernel.org/netdev/[email protected]/

I reviewed that one, problem is it's a spec violation/change that we'll
have to support forever. 

> This is a major change that should be merged with more caution.
> Could this have too much of an impact on performance?
> 
> Thanks,
> Stefano

It's really a POC, real patch is left as an excersise for the reader :).
The correct approach IMHO is to only start using this
when we wasted a lot of memory on small packets.

For example, if sum(truesize) >= buf size.

then we'll not see a perf impact unless it's already pathological.


> > Yet it seems to work fine in light testing.
> > 
> > -->
> > 
> > 
> > vsock/virtio: use DWARF ULEB128 to record EOM boundaries, enable cross-EOM 
> > skb coalescing
> > 
> > virtio_transport_recv_enqueue() currently refuses to coalesce an
> > incoming skb with the previous one when the previous skb carries
> > VIRTIO_VSOCK_SEQ_EOM.  This forces one skb per seqpacket message.
> > For workloads with many small or zero-byte messages the per-skb
> > overhead (~960 bytes) dominates, causing unbounded memory growth.
> > 
> > Decouple message boundary tracking from the skb structure: store
> > boundary offsets in a compact side buffer using DWARF ULEB128
> > encoding with the EOR flag folded into the low bit, then allow
> > the data of multiple complete messages to be coalesced into a single
> > skb.
> > 
> > Cross-EOM coalescing fires only when:
> > - both the tail skb and the incoming packet carry EOM (complete msgs)
> > - the incoming packet fits in the tail skb's tailroom
> > - no BPF psock is attached (read_skb expects one msg per skb)
> > 
> > On allocation failure the code falls back to separate skbs (existing
> > behaviour).  Credit accounting is unchanged; the boundary buffer is
> > capped at PAGE_SIZE.
> > 
> > Signed-off-by: Michael S. Tsirkin <[email protected]>
> > Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]>
> > 
> > diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
> > index f91704731057..e36b9ab28372 100644
> > --- a/include/linux/virtio_vsock.h
> > +++ b/include/linux/virtio_vsock.h
> > @@ -12,6 +12,7 @@
> > struct virtio_vsock_skb_cb {
> >     bool reply;
> >     bool tap_delivered;
> > +   bool has_boundary_entries;
> >     u32 offset;
> > };
> > 
> > @@ -167,6 +168,12 @@ struct virtio_vsock_sock {
> >     u32 buf_used;
> >     struct sk_buff_head rx_queue;
> >     u32 msg_count;
> > +
> > +   /* ULEB128-encoded seqpacket message boundary buffer */
> > +   u8 *boundary_buf;
> > +   u32 boundary_len;
> > +   u32 boundary_alloc;
> > +   u32 boundary_off;
> > };
> > 
> > struct virtio_vsock_pkt_info {
> > diff --git a/net/vmw_vsock/virtio_transport_common.c 
> > b/net/vmw_vsock/virtio_transport_common.c
> > index 416d533f493d..81654f70f72c 100644
> > --- a/net/vmw_vsock/virtio_transport_common.c
> > +++ b/net/vmw_vsock/virtio_transport_common.c
> > @@ -11,6 +11,7 @@
> > #include <linux/sched/signal.h>
> > #include <linux/ctype.h>
> > #include <linux/list.h>
> > +#include <linux/skmsg.h>
> > #include <linux/virtio_vsock.h>
> > #include <uapi/linux/vsockmon.h>
> > 
> > @@ -26,6 +27,91 @@
> > /* Threshold for detecting small packets to copy */
> > #define GOOD_COPY_LEN  128
> > 
> > +#define VSOCK_BOUNDARY_BUF_INIT    64
> > +#define VSOCK_BOUNDARY_BUF_MAX     PAGE_SIZE
> > +
> > +/* ULEB128 boundary encoding: value = (msg_len << 1) | eor.
> > + * Each byte carries 7 data bits; bit 7 is set on all but the last byte.
> > + * Max 5 bytes for a u32 msg_len (33 bits with eor shift).
> > + */
> > +static int vsock_uleb_encode_boundary(u8 *buf, u32 msg_len, bool eor)
> > +{
> > +   u64 val = ((u64)msg_len << 1) | eor;
> > +   int n = 0;
> > +
> > +   do {
> > +           buf[n] = val & 0x7f;
> > +           val >>= 7;
> > +           if (val)
> > +                   buf[n] |= 0x80;
> > +           n++;
> > +   } while (val);
> > +
> > +   return n;
> > +}
> > +
> > +static int vsock_uleb_decode_boundary(const u8 *buf, u32 avail,
> > +                                 u32 *msg_len, bool *eor)
> > +{
> > +   u64 val = 0;
> > +   int shift = 0;
> > +   int n = 0;
> > +
> > +   do {
> > +           if (n >= avail || shift >= 35)
> > +                   return -EINVAL;
> > +           val |= (u64)(buf[n] & 0x7f) << shift;
> > +           shift += 7;
> > +   } while (buf[n++] & 0x80);
> > +
> > +   *eor = val & 1;
> > +   *msg_len = val >> 1;
> > +   return n;
> > +}
> > +
> > +static void vsock_boundary_buf_compact(struct virtio_vsock_sock *vvs)
> > +{
> > +   if (vvs->boundary_off == 0)
> > +           return;
> > +
> > +   vvs->boundary_len -= vvs->boundary_off;
> > +   memmove(vvs->boundary_buf, vvs->boundary_buf + vvs->boundary_off,
> > +           vvs->boundary_len);
> > +   vvs->boundary_off = 0;
> > +}
> > +
> > +static int vsock_boundary_buf_ensure(struct virtio_vsock_sock *vvs, u32 
> > needed)
> > +{
> > +   u32 new_alloc;
> > +   u8 *new_buf;
> > +
> > +   if (vvs->boundary_alloc >= needed)
> > +           return 0;
> > +
> > +   /* Reclaim consumed space before growing */
> > +   if (vvs->boundary_off) {
> > +           needed -= vvs->boundary_off;
> > +           vsock_boundary_buf_compact(vvs);
> > +           if (vvs->boundary_alloc >= needed)
> > +                   return 0;
> > +   }
> > +
> > +   new_alloc = max(needed, vvs->boundary_alloc ? vvs->boundary_alloc * 2
> > +                                               : VSOCK_BOUNDARY_BUF_INIT);
> > +   if (new_alloc > VSOCK_BOUNDARY_BUF_MAX)
> > +           new_alloc = VSOCK_BOUNDARY_BUF_MAX;
> > +   if (new_alloc < needed)
> > +           return -ENOMEM;
> > +
> > +   new_buf = krealloc(vvs->boundary_buf, new_alloc, GFP_ATOMIC);
> > +   if (!new_buf)
> > +           return -ENOMEM;
> > +
> > +   vvs->boundary_buf = new_buf;
> > +   vvs->boundary_alloc = new_alloc;
> > +   return 0;
> > +}
> > +
> > static void virtio_transport_cancel_close_work(struct vsock_sock *vsk,
> >                                            bool cancel_timeout);
> > static s64 virtio_transport_has_space(struct virtio_vsock_sock *vvs);
> > @@ -682,41 +768,74 @@ virtio_transport_seqpacket_do_peek(struct vsock_sock 
> > *vsk,
> >     total = 0;
> >     len = msg_data_left(msg);
> > 
> > -   skb_queue_walk(&vvs->rx_queue, skb) {
> > -           struct virtio_vsock_hdr *hdr;
> > +   skb = skb_peek(&vvs->rx_queue);
> > +   if (skb && VIRTIO_VSOCK_SKB_CB(skb)->has_boundary_entries) {
> > +           u32 msg_len, offset;
> > +           size_t bytes;
> > +           bool eor;
> > +           int ret;
> > 
> > -           if (total < len) {
> > -                   size_t bytes;
> > +           ret = vsock_uleb_decode_boundary(
> > +                   vvs->boundary_buf + vvs->boundary_off,
> > +                   vvs->boundary_len - vvs->boundary_off,
> > +                   &msg_len, &eor);
> > +           if (ret < 0)
> > +                   goto unlock;
> > +
> > +           offset = VIRTIO_VSOCK_SKB_CB(skb)->offset;
> > +           bytes = min(len, (size_t)msg_len);
> > +
> > +           if (bytes) {
> >                     int err;
> > 
> > -                   bytes = len - total;
> > -                   if (bytes > skb->len)
> > -                           bytes = skb->len;
> > -
> >                     spin_unlock_bh(&vvs->rx_lock);
> > -
> > -                   /* sk_lock is held by caller so no one else can dequeue.
> > -                    * Unlock rx_lock since skb_copy_datagram_iter() may 
> > sleep.
> > -                    */
> > -                   err = skb_copy_datagram_iter(skb, 
> > VIRTIO_VSOCK_SKB_CB(skb)->offset,
> > +                   err = skb_copy_datagram_iter(skb, offset,
> >                                                  &msg->msg_iter, bytes);
> >                     if (err)
> >                             return err;
> > -
> >                     spin_lock_bh(&vvs->rx_lock);
> >             }
> > 
> > -           total += skb->len;
> > -           hdr = virtio_vsock_hdr(skb);
> > +           total = msg_len;
> > +           if (eor)
> > +                   msg->msg_flags |= MSG_EOR;
> > +   } else {
> > +           skb_queue_walk(&vvs->rx_queue, skb) {
> > +                   struct virtio_vsock_hdr *hdr;
> > 
> > -           if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
> > -                   if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR)
> > -                           msg->msg_flags |= MSG_EOR;
> > +                   if (total < len) {
> > +                           size_t bytes;
> > +                           int err;
> > 
> > -                   break;
> > +                           bytes = len - total;
> > +                           if (bytes > skb->len)
> > +                                   bytes = skb->len;
> > +
> > +                           spin_unlock_bh(&vvs->rx_lock);
> > +
> > +                           err = skb_copy_datagram_iter(
> > +                                   skb,
> > +                                   VIRTIO_VSOCK_SKB_CB(skb)->offset,
> > +                                   &msg->msg_iter, bytes);
> > +                           if (err)
> > +                                   return err;
> > +
> > +                           spin_lock_bh(&vvs->rx_lock);
> > +                   }
> > +
> > +                   total += skb->len;
> > +                   hdr = virtio_vsock_hdr(skb);
> > +
> > +                   if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
> > +                           if (le32_to_cpu(hdr->flags) &
> > +                               VIRTIO_VSOCK_SEQ_EOR)
> > +                                   msg->msg_flags |= MSG_EOR;
> > +                           break;
> > +                   }
> >             }
> >     }
> > 
> > +unlock:
> >     spin_unlock_bh(&vvs->rx_lock);
> > 
> >     return total;
> > @@ -740,57 +859,105 @@ static int 
> > virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
> >     }
> > 
> >     while (!msg_ready) {
> > -           struct virtio_vsock_hdr *hdr;
> > -           size_t pkt_len;
> > -
> > -           skb = __skb_dequeue(&vvs->rx_queue);
> > +           skb = skb_peek(&vvs->rx_queue);
> >             if (!skb)
> >                     break;
> > -           hdr = virtio_vsock_hdr(skb);
> > -           pkt_len = (size_t)le32_to_cpu(hdr->len);
> > 
> > -           if (dequeued_len >= 0) {
> > +           if (VIRTIO_VSOCK_SKB_CB(skb)->has_boundary_entries) {
> >                     size_t bytes_to_copy;
> > +                   u32 msg_len, offset;
> > +                   bool eor;
> > +                   int ret;
> > 
> > -                   bytes_to_copy = min(user_buf_len, pkt_len);
> > +                   ret = vsock_uleb_decode_boundary(
> > +                           vvs->boundary_buf + vvs->boundary_off,
> > +                           vvs->boundary_len - vvs->boundary_off,
> > +                           &msg_len, &eor);
> > +                   if (ret < 0)
> > +                           break;
> > +                   vvs->boundary_off += ret;
> > 
> > -                   if (bytes_to_copy) {
> > +                   offset = VIRTIO_VSOCK_SKB_CB(skb)->offset;
> > +                   bytes_to_copy = min(user_buf_len, (size_t)msg_len);
> > +
> > +                   if (bytes_to_copy && dequeued_len >= 0) {
> >                             int err;
> > 
> > -                           /* sk_lock is held by caller so no one else can 
> > dequeue.
> > -                            * Unlock rx_lock since 
> > skb_copy_datagram_iter() may sleep.
> > -                            */
> >                             spin_unlock_bh(&vvs->rx_lock);
> > -
> > -                           err = skb_copy_datagram_iter(skb, 0,
> > +                           err = skb_copy_datagram_iter(skb, offset,
> >                                                          &msg->msg_iter,
> >                                                          bytes_to_copy);
> > -                           if (err) {
> > -                                   /* Copy of message failed. Rest of
> > -                                    * fragments will be freed without copy.
> > -                                    */
> > -                                   dequeued_len = err;
> > -                           } else {
> > -                                   user_buf_len -= bytes_to_copy;
> > -                           }
> > -
> >                             spin_lock_bh(&vvs->rx_lock);
> > +                           if (err)
> > +                                   dequeued_len = err;
> > +                           else
> > +                                   user_buf_len -= bytes_to_copy;
> >                     }
> > 
> >                     if (dequeued_len >= 0)
> > -                           dequeued_len += pkt_len;
> > -           }
> > +                           dequeued_len += msg_len;
> > 
> > -           if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
> > +                   VIRTIO_VSOCK_SKB_CB(skb)->offset += msg_len;
> >                     msg_ready = true;
> >                     vvs->msg_count--;
> > 
> > -                   if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR)
> > +                   if (eor)
> >                             msg->msg_flags |= MSG_EOR;
> > -           }
> > 
> > -           virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len);
> > -           kfree_skb(skb);
> > +                   virtio_transport_dec_rx_pkt(vvs, msg_len, msg_len);
> > +
> > +                   if (VIRTIO_VSOCK_SKB_CB(skb)->offset >= skb->len) {
> > +                           __skb_unlink(skb, &vvs->rx_queue);
> > +                           kfree_skb(skb);
> > +                   }
> > +
> > +                   if (vvs->boundary_off >= vvs->boundary_len / 2)
> > +                           vsock_boundary_buf_compact(vvs);
> > +           } else {
> > +                   struct virtio_vsock_hdr *hdr;
> > +                   size_t pkt_len;
> > +
> > +                   skb = __skb_dequeue(&vvs->rx_queue);
> > +                   if (!skb)
> > +                           break;
> > +                   hdr = virtio_vsock_hdr(skb);
> > +                   pkt_len = (size_t)le32_to_cpu(hdr->len);
> > +
> > +                   if (dequeued_len >= 0) {
> > +                           size_t bytes_to_copy;
> > +
> > +                           bytes_to_copy = min(user_buf_len, pkt_len);
> > +
> > +                           if (bytes_to_copy) {
> > +                                   int err;
> > +
> > +                                   spin_unlock_bh(&vvs->rx_lock);
> > +                                   err = skb_copy_datagram_iter(
> > +                                           skb, 0, &msg->msg_iter,
> > +                                           bytes_to_copy);
> > +                                   if (err)
> > +                                           dequeued_len = err;
> > +                                   else
> > +                                           user_buf_len -= bytes_to_copy;
> > +                                   spin_lock_bh(&vvs->rx_lock);
> > +                           }
> > +
> > +                           if (dequeued_len >= 0)
> > +                                   dequeued_len += pkt_len;
> > +                   }
> > +
> > +                   if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
> > +                           msg_ready = true;
> > +                           vvs->msg_count--;
> > +
> > +                           if (le32_to_cpu(hdr->flags) &
> > +                               VIRTIO_VSOCK_SEQ_EOR)
> > +                                   msg->msg_flags |= MSG_EOR;
> > +                   }
> > +
> > +                   virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len);
> > +                   kfree_skb(skb);
> > +           }
> >     }
> > 
> >     spin_unlock_bh(&vvs->rx_lock);
> > @@ -1132,6 +1299,7 @@ void virtio_transport_destruct(struct vsock_sock *vsk)
> > 
> >     virtio_transport_cancel_close_work(vsk, true);
> > 
> > +   kfree(vvs->boundary_buf);
> >     kfree(vvs);
> >     vsk->trans = NULL;
> > }
> > @@ -1224,6 +1392,11 @@ static void virtio_transport_remove_sock(struct 
> > vsock_sock *vsk)
> >      * removing it.
> >      */
> >     __skb_queue_purge(&vvs->rx_queue);
> > +   kfree(vvs->boundary_buf);
> > +   vvs->boundary_buf = NULL;
> > +   vvs->boundary_len = 0;
> > +   vvs->boundary_alloc = 0;
> > +   vvs->boundary_off = 0;
> >     vsock_remove_sock(vsk);
> > }
> > 
> > @@ -1395,23 +1568,62 @@ virtio_transport_recv_enqueue(struct vsock_sock 
> > *vsk,
> >         !skb_is_nonlinear(skb)) {
> >             struct virtio_vsock_hdr *last_hdr;
> >             struct sk_buff *last_skb;
> > +           bool last_has_eom;
> > +           bool has_eom;
> > 
> >             last_skb = skb_peek_tail(&vvs->rx_queue);
> >             last_hdr = virtio_vsock_hdr(last_skb);
> > +           last_has_eom = le32_to_cpu(last_hdr->flags) & 
> > VIRTIO_VSOCK_SEQ_EOM;
> > +           has_eom = le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM;
> > 
> > -           /* If there is space in the last packet queued, we copy the
> > -            * new packet in its buffer. We avoid this if the last packet
> > -            * queued has VIRTIO_VSOCK_SEQ_EOM set, because this is
> > -            * delimiter of SEQPACKET message, so 'pkt' is the first packet
> > -            * of a new message.
> > -            */
> > -           if (skb->len < skb_tailroom(last_skb) &&
> > -               !(le32_to_cpu(last_hdr->flags) & VIRTIO_VSOCK_SEQ_EOM)) {
> > -                   memcpy(skb_put(last_skb, skb->len), skb->data, 
> > skb->len);
> > -                   free_pkt = true;
> > -                   last_hdr->flags |= hdr->flags;
> > -                   le32_add_cpu(&last_hdr->len, len);
> > -                   goto out;
> > +           if (skb->len < skb_tailroom(last_skb)) {
> > +                   if (!last_has_eom) {
> > +                           /* Same-message coalescing (existing path) */
> > +                           memcpy(skb_put(last_skb, skb->len),
> > +                                  skb->data, skb->len);
> > +                           free_pkt = true;
> > +                           last_hdr->flags |= hdr->flags;
> > +                           le32_add_cpu(&last_hdr->len, len);
> > +                           goto out;
> > +                   }
> > +
> > +                   /* Cross-EOM: coalesce complete messages into one skb,
> > +                    * recording message boundaries in a compact BER buffer.
> > +                    * Only when incoming packet also has EOM (complete 
> > msg).
> > +                    */
> > +                   if (has_eom && !sk_psock(sk_vsock(vsk))) {
> > +                           bool prev_eor, cur_eor;
> > +                           u8 tmp[12];
> > +                           int n = 0;
> > +
> > +                           cur_eor = le32_to_cpu(hdr->flags) &
> > +                                     VIRTIO_VSOCK_SEQ_EOR;
> > +
> > +                           if 
> > (!VIRTIO_VSOCK_SKB_CB(last_skb)->has_boundary_entries) {
> > +                                   u32 prev_len = 
> > le32_to_cpu(last_hdr->len);
> > +
> > +                                   prev_eor = le32_to_cpu(last_hdr->flags) 
> > &
> > +                                              VIRTIO_VSOCK_SEQ_EOR;
> > +                                   n += vsock_uleb_encode_boundary(
> > +                                           tmp + n, prev_len, prev_eor);
> > +                           }
> > +                           n += vsock_uleb_encode_boundary(
> > +                                   tmp + n, len, cur_eor);
> > +
> > +                           if (!vsock_boundary_buf_ensure(
> > +                                       vvs, vvs->boundary_len + n)) {
> > +                                   memcpy(vvs->boundary_buf +
> > +                                          vvs->boundary_len, tmp, n);
> > +                                   vvs->boundary_len += n;
> > +                                   
> > VIRTIO_VSOCK_SKB_CB(last_skb)->has_boundary_entries = true;
> > +                                   memcpy(skb_put(last_skb, skb->len),
> > +                                          skb->data, skb->len);
> > +                                   free_pkt = true;
> > +                                   last_hdr->flags |= hdr->flags;
> > +                                   le32_add_cpu(&last_hdr->len, len);
> > +                                   goto out;
> > +                           }
> > +                   }
> >             }
> >     }
> > 
> > 


Reply via email to