From: Paolo Abeni <pab...@redhat.com>

If the current sendmsg() lands on the same subflow we used last, we
can try to collapse the data.

Signed-off-by: Paolo Abeni <pab...@redhat.com>
---
 net/mptcp/protocol.c | 79 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 60 insertions(+), 19 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index d51201c09519..3fb0f3163743 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -47,12 +47,25 @@ static struct sock *mptcp_subflow_get_ref(const struct 
mptcp_sock *msk)
        return NULL;
 }
 
+static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
+                                            const struct sk_buff *skb,
+                                            const struct mptcp_ext *mpext)
+{
+       if (!tcp_skb_can_collapse_to(skb))
+               return false;
+
+       /* can collapse only if MPTCP level sequence is in order */
+       return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
+}
+
 static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
-                             struct msghdr *msg, long *timeo)
+                             struct msghdr *msg, long *timeo, int *pmss_now,
+                             int *ps_goal)
 {
+       int mss_now, avail_size, size_goal, ret;
        struct mptcp_sock *msk = mptcp_sk(sk);
+       bool collapsed, can_collapse = false;
        struct mptcp_ext *mpext = NULL;
-       int mss_now, size_goal, ret;
        struct page_frag *pfrag;
        struct sk_buff *skb;
        size_t psize;
@@ -69,8 +82,31 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock 
*ssk,
 
        /* compute copy limit */
        mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
-       psize = min_t(int, pfrag->size - pfrag->offset, size_goal);
+       *pmss_now = mss_now;
+       *ps_goal = size_goal;
+       avail_size = size_goal;
+       skb = tcp_write_queue_tail(ssk);
+       if (skb) {
+               mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+
+               /* Limit the write to the size available in the
+                * current skb, if any, so that we create at most a new skb.
+                * If we run out of space in the current skb (e.g. the window
+                * size shrunk from last sent) a new skb will be allocated even
+                * is collapsing was allowed: collapsing is effectively
+                * disabled.
+                */
+               can_collapse = mptcp_skb_can_collapse_to(msk, skb, mpext);
+               if (!can_collapse)
+                       TCP_SKB_CB(skb)->eor = 1;
+               else if (size_goal - skb->len > 0)
+                       avail_size = size_goal - skb->len;
+               else
+                       can_collapse = false;
+       }
+       psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);
 
+       /* Copy to page */
        pr_debug("left=%zu", msg_data_left(msg));
        psize = copy_page_from_iter(pfrag->page, pfrag->offset,
                                    min_t(size_t, msg_data_left(msg), psize),
@@ -79,14 +115,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock 
*ssk,
        if (!psize)
                return -EINVAL;
 
-       /* Mark the end of the previous write so the beginning of the
-        * next write (with its own mptcp skb extension data) is not
-        * collapsed.
+       /* tell the TCP stack to delay the push so that we can safely
+        * access the skb after the sendpages call
         */
-       skb = tcp_write_queue_tail(ssk);
-       if (skb)
-               TCP_SKB_CB(skb)->eor = 1;
-
        ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
                               msg->msg_flags | MSG_SENDPAGE_NOTLAST);
        if (ret <= 0)
@@ -94,13 +125,16 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock 
*ssk,
        if (unlikely(ret < psize))
                iov_iter_revert(&msg->msg_iter, psize - ret);
 
-       if (skb == tcp_write_queue_tail(ssk))
-               pr_err("no new skb %p/%p", sk, ssk);
+       collapsed = skb == tcp_write_queue_tail(ssk);
+       BUG_ON(collapsed && !can_collapse);
+       if (collapsed) {
+               /* when collapsing mpext always exists */
+               mpext->data_len += ret;
+               goto out;
+       }
 
        skb = tcp_write_queue_tail(ssk);
-
        mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
-
        if (mpext) {
                memset(mpext, 0, sizeof(*mpext));
                mpext->data_seq = msk->write_seq;
@@ -113,22 +147,25 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct 
sock *ssk,
                pr_debug("data_seq=%llu subflow_seq=%u data_len=%u checksum=%u, 
dsn64=%d",
                         mpext->data_seq, mpext->subflow_seq, mpext->data_len,
                         mpext->checksum, mpext->dsn64);
-       } /* TODO: else fallback */
+       }
+       /* TODO: else fallback; allocation can fail, but we can't easily retire
+        * skbs from the write_queue, as we need to roll-back TCP status
+        */
 
+out:
        pfrag->offset += ret;
        msk->write_seq += ret;
        subflow_ctx(ssk)->rel_write_seq += ret;
 
-       tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal);
        return ret;
 }
 
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
+       int mss_now = 0, size_goal = 0, ret = 0;
        struct mptcp_sock *msk = mptcp_sk(sk);
        size_t copied = 0;
        struct sock *ssk;
-       int ret = 0;
        long timeo;
 
        pr_debug("msk=%p", msk);
@@ -158,14 +195,18 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr 
*msg, size_t len)
        lock_sock(ssk);
        timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
        while (msg_data_left(msg)) {
-               ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo);
+               ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
+                                        &size_goal);
                if (ret < 0)
                        break;
 
                copied += ret;
        }
-       if (copied > 0)
+       if (copied) {
                ret = copied;
+               tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
+                        size_goal);
+       }
 
        release_sock(ssk);
        release_sock(sk);
-- 
2.22.0

Reply via email to