On Wed, Nov 21, 2018 at 9:52 AM, Eric Dumazet <eduma...@google.com> wrote:
>
> In case GRO is not as efficient as it should be or disabled,
> we might have a user thread trapped in __release_sock() while
> softirq handler flood packets up to the point we have to drop.
>
> This patch balances work done from user thread and softirq,
> to give more chances to __release_sock() to complete its work.
>
> This also helps if we receive many ACK packets, since GRO
> does not aggregate them.
>
> Signed-off-by: Eric Dumazet <eduma...@google.com>
> Tested-by: Jean-Louis Dupond <jean-lo...@dupond.be>
> Cc: Neal Cardwell <ncardw...@google.com>
> Cc: Yuchung Cheng <ych...@google.com>
> ---
>  include/uapi/linux/snmp.h |  1 +
>  net/ipv4/proc.c           |  1 +
>  net/ipv4/tcp_ipv4.c       | 75 +++++++++++++++++++++++++++++++++++----
>  3 files changed, 71 insertions(+), 6 deletions(-)
>
> diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
> index f80135e5feaa886000009db6dff75b2bc2d637b2..86dc24a96c90ab047d5173d625450facd6c6dd79 100644
> --- a/include/uapi/linux/snmp.h
> +++ b/include/uapi/linux/snmp.h
> @@ -243,6 +243,7 @@ enum
>         LINUX_MIB_TCPREQQFULLDROP,              /* TCPReqQFullDrop */
>         LINUX_MIB_TCPRETRANSFAIL,               /* TCPRetransFail */
>         LINUX_MIB_TCPRCVCOALESCE,               /* TCPRcvCoalesce */
> +       LINUX_MIB_TCPBACKLOGCOALESCE,           /* TCPBacklogCoalesce */
>         LINUX_MIB_TCPOFOQUEUE,                  /* TCPOFOQueue */
>         LINUX_MIB_TCPOFODROP,                   /* TCPOFODrop */
>         LINUX_MIB_TCPOFOMERGE,                  /* TCPOFOMerge */
> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
> index 70289682a6701438aed99a00a9705c39fa4394d3..c3610b37bb4ce665b1976d8cc907b6dd0de42ab9 100644
> --- a/net/ipv4/proc.c
> +++ b/net/ipv4/proc.c
> @@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {
>         SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
>         SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
>         SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
> +       SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
>         SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
>         SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
>         SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 795605a2327504b8a025405826e7e0ca8dc8501d..401e1d1cb904a4c7963d8baa419cfbf178593344 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1619,12 +1619,10 @@ int tcp_v4_early_demux(struct sk_buff *skb)
>  bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
>  {
>         u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
> -
> -       /* Only socket owner can try to collapse/prune rx queues
> -        * to reduce memory overhead, so add a little headroom here.
> -        * Few sockets backlog are possibly concurrently non empty.
> -        */
> -       limit += 64*1024;
> +       struct skb_shared_info *shinfo;
> +       const struct tcphdr *th;
> +       struct sk_buff *tail;
> +       unsigned int hdrlen;
>
>         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
>          * we can fix skb->truesize to its real value to avoid future drops.
> @@ -1636,6 +1634,71 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
>
>         skb_dst_drop(skb);
>
> +       if (unlikely(tcp_checksum_complete(skb))) {
> +               bh_unlock_sock(sk);
> +               __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
> +               __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
> +               return true;
> +       }
> +
> +       /* Attempt coalescing to last skb in backlog, even if we are
> +        * above the limits.
> +        * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
> +        */
> +       th = (const struct tcphdr *)skb->data;
> +       hdrlen = th->doff * 4;
> +       shinfo = skb_shinfo(skb);
> +
> +       if (!shinfo->gso_size)
> +               shinfo->gso_size = skb->len - hdrlen;
> +
> +       if (!shinfo->gso_segs)
> +               shinfo->gso_segs = 1;
> +
> +       tail = sk->sk_backlog.tail;
> +       if (tail &&
> +           TCP_SKB_CB(tail)->end_seq == TCP_SKB_CB(skb)->seq &&
> +#ifdef CONFIG_TLS_DEVICE
> +           tail->decrypted == skb->decrypted &&
> +#endif
> +           !memcmp(tail->data + sizeof(*th), skb->data + sizeof(*th),
> +                   hdrlen - sizeof(*th))) {
> +               bool fragstolen;
> +               int delta;
> +
> +               __skb_pull(skb, hdrlen);
> +               if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
> +                       TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
> +                       TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
> +                       TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
> +
> +                       if (TCP_SKB_CB(skb)->has_rxtstamp) {
> +                               TCP_SKB_CB(tail)->has_rxtstamp = true;
> +                               tail->tstamp = skb->tstamp;
> +                               skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
> +                       }
> +

Really nice! Would it make sense to re-use (some of) the similar tcp_try_coalesce()?
> +                       /* Not as strict as GRO. We only need to carry mss max value */
> +                       skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
> +                                                        skb_shinfo(tail)->gso_size);
> +
> +                       skb_shinfo(tail)->gso_segs += shinfo->gso_segs;
> +
> +                       sk->sk_backlog.len += delta;
> +                       __NET_INC_STATS(sock_net(sk),
> +                                       LINUX_MIB_TCPBACKLOGCOALESCE);
> +                       kfree_skb_partial(skb, fragstolen);
> +                       return false;
> +               }
> +               __skb_push(skb, hdrlen);
> +       }
> +
> +       /* Only socket owner can try to collapse/prune rx queues
> +        * to reduce memory overhead, so add a little headroom here.
> +        * Few sockets backlog are possibly concurrently non empty.
> +        */
> +       limit += 64*1024;
> +
>         if (unlikely(sk_add_backlog(sk, skb, limit))) {
>                 bh_unlock_sock(sk);
>                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
> --
> 2.19.1.1215.g8438c0b245-goog
>
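
FWIW, for anyone testing this: the new counter should show up on the TcpExt line of
/proc/net/netstat, like the other entries in snmp4_net_list. A quick, purely illustrative
userspace reader I'd use to watch it while generating load -- not part of the patch, the
file name and all of it hypothetical:

/* backlog_coalesce.c - print the TCPBacklogCoalesce counter parsed from
 * the TcpExt name/value line pair in /proc/net/netstat.
 * Build: gcc -O2 -Wall -o backlog_coalesce backlog_coalesce.c
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char names[8192], values[8192];
        FILE *f = fopen("/proc/net/netstat", "r");

        if (!f) {
                perror("fopen /proc/net/netstat");
                return 1;
        }
        /* The file alternates "TcpExt: <names>" and "TcpExt: <values>"
         * lines (and likewise for IpExt), so read them pairwise.
         */
        while (fgets(names, sizeof(names), f)) {
                char *n, *v, *sn, *sv;

                if (strncmp(names, "TcpExt:", 7) != 0)
                        continue;
                if (!fgets(values, sizeof(values), f))
                        break;
                n = strtok_r(names + 7, " \n", &sn);
                v = strtok_r(values + 7, " \n", &sv);
                while (n && v) {
                        if (strcmp(n, "TCPBacklogCoalesce") == 0) {
                                printf("TCPBacklogCoalesce: %s\n", v);
                                fclose(f);
                                return 0;
                        }
                        n = strtok_r(NULL, " \n", &sn);
                        v = strtok_r(NULL, " \n", &sv);
                }
        }
        fclose(f);
        fprintf(stderr, "TCPBacklogCoalesce not found (pre-patch kernel?)\n");
        return 1;
}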