The existing splice_read implementation copies all data through
intermediate pages (alloc_page + memcpy). This is wasteful for
skb fragment pages, which are allocated from the page allocator
and can be safely referenced via get_page().

Optimize by checking PageSlab() to distinguish between linear
skb data (slab-backed) and fragment pages (page allocator-backed):

- For slab pages (skb linear data): copy to a page fragment via
  sk_page_frag, matching what linear_to_page() does in the
  standard TCP splice path (skb_splice_bits). get_page() is
  invalid on slab pages so a copy is unavoidable here.
- For non-slab pages (skb frags): use get_page() directly for
  true zero-copy, same as skb_splice_bits does for fragments.

Both paths use nosteal_pipe_buf_ops. The sk_page_frag approach
is more memory-efficient than alloc_page for small linear copies,
as multiple copies can share a single page fragment.

Benchmark results with rx-verdict-ingress mode (loopback, 8 CPUs):

  splice(2) + always-copy:  ~2770 MB/s (before this patch)
  splice(2) + zero-copy:    ~4270 MB/s (after this patch, +54%)
  read(2):                  ~4292 MB/s (baseline for reference)

Signed-off-by: Jiayuan Chen <[email protected]>
---
 net/ipv4/tcp_bpf.c | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index e85a27e32ea7..13506ba7672f 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -447,6 +447,7 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 struct tcp_bpf_splice_ctx {
        struct pipe_inode_info *pipe;
+       struct sock *sk;
 };
 
 static int sk_msg_splice_actor(void *arg, struct page *page,
@@ -458,13 +459,33 @@ static int sk_msg_splice_actor(void *arg, struct page *page,
        };
        ssize_t ret;
 
-       buf.page = alloc_page(GFP_KERNEL);
-       if (!buf.page)
-               return 0;
+       if (PageSlab(page)) {
+               /*
+                * skb linear data is backed by slab memory where
+                * get_page() is invalid. Copy to a page fragment from
+                * the socket's page allocator, matching what
+                * linear_to_page() does in the standard TCP splice
+                * path (skb_splice_bits).
+                */
+               struct page_frag *pfrag = sk_page_frag(ctx->sk);
+
+               if (!sk_page_frag_refill(ctx->sk, pfrag))
+                       return 0;
 
-       memcpy(page_address(buf.page), page_address(page) + offset, len);
-       buf.offset = 0;
-       buf.len = len;
+               len = min_t(size_t, len, pfrag->size - pfrag->offset);
+               memcpy(page_address(pfrag->page) + pfrag->offset,
+                      page_address(page) + offset, len);
+               buf.page = pfrag->page;
+               buf.offset = pfrag->offset;
+               buf.len = len;
+               pfrag->offset += len;
+       } else {
+               buf.page = page;
+               buf.offset = offset;
+               buf.len = len;
+       }
+
+       get_page(buf.page);
 
        /*
         * add_to_pipe() calls pipe_buf_release() on failure, which
@@ -481,9 +502,9 @@ static ssize_t tcp_bpf_splice_read(struct socket *sock, loff_t *ppos,
                                   struct pipe_inode_info *pipe, size_t len,
                                   unsigned int flags)
 {
-       struct tcp_bpf_splice_ctx ctx = { .pipe = pipe };
-       int bpf_flags = flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0;
        struct sock *sk = sock->sk;
+       struct tcp_bpf_splice_ctx ctx = { .pipe = pipe, .sk = sk };
+       int bpf_flags = flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0;
        struct sk_psock *psock;
        int ret;
 
@@ -508,9 +529,9 @@ static ssize_t tcp_bpf_splice_read_parser(struct socket *sock, loff_t *ppos,
                                          struct pipe_inode_info *pipe,
                                          size_t len, unsigned int flags)
 {
-       struct tcp_bpf_splice_ctx ctx = { .pipe = pipe };
-       int bpf_flags = flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0;
        struct sock *sk = sock->sk;
+       struct tcp_bpf_splice_ctx ctx = { .pipe = pipe, .sk = sk };
+       int bpf_flags = flags & SPLICE_F_NONBLOCK ? MSG_DONTWAIT : 0;
        struct sk_psock *psock;
        int ret;
 
-- 
2.43.0


Reply via email to