On Fri, May 05, 2023 at 05:54:49PM +0200, Alexander Bluhm wrote: > Claudio suggested to implement TCP send offloading in software as > a fallback if hardware cannot do it.
Updated diff below: - some cleanup has been committed - pf route-to should work now - disable TSO if IP options are set Not sure if I addressed all corner cases already. I think IPsec is missing. If someone wants to test it, feel free. It has an impact on outgoing TCP traffic that is locally generated. bluhm Index: net/pf.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v retrieving revision 1.1176 diff -u -p -r1.1176 pf.c --- net/pf.c 7 May 2023 16:23:23 -0000 1.1176 +++ net/pf.c 7 May 2023 18:50:16 -0000 @@ -6548,8 +6548,6 @@ pf_route(struct pf_pdesc *pd, struct pf_ ip = mtod(m0, struct ip *); } - in_proto_cksum_out(m0, ifp); - if (ntohs(ip->ip_len) <= ifp->if_mtu) { ip->ip_sum = 0; if (ifp->if_capabilities & IFCAP_CSUM_IPv4) @@ -6558,10 +6556,21 @@ pf_route(struct pf_pdesc *pd, struct pf_ ipstat_inc(ips_outswcsum); ip->ip_sum = in_cksum(m0, ip->ip_hl << 2); } + in_proto_cksum_out(m0, ifp); + ifp->if_output(ifp, m0, sintosa(dst), rt); goto done; } + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) && + m0->m_pkthdr.ph_mss <= ifp->if_mtu) { + if (tcp_chopper(m0, &ml, ifp, m0->m_pkthdr.ph_mss) || + if_output_ml(ifp, &ml, sintosa(dst), rt)) + goto done; + tcpstat_inc(tcps_outswtso); + goto done; + } + /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. @@ -6595,6 +6604,7 @@ void pf_route6(struct pf_pdesc *pd, struct pf_state *st) { struct mbuf *m0; + struct mbuf_list ml; struct sockaddr_in6 *dst, sin6; struct rtentry *rt = NULL; struct ip6_hdr *ip6; @@ -6677,8 +6687,6 @@ pf_route6(struct pf_pdesc *pd, struct pf } } - in6_proto_cksum_out(m0, ifp); - /* * If packet has been reassembled by PF earlier, we have to * use pf_refragment6() here to turn it back to fragments. 
@@ -6689,9 +6697,19 @@ pf_route6(struct pf_pdesc *pd, struct pf } if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) { + in6_proto_cksum_out(m0, ifp); ifp->if_output(ifp, m0, sin6tosa(dst), rt); goto done; } + + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO) && + m0->m_pkthdr.ph_mss <= ifp->if_mtu) { + if (tcp_chopper(m0, &ml, ifp, m0->m_pkthdr.ph_mss) || + if_output_ml(ifp, &ml, sin6tosa(dst), rt)) + goto done; + tcpstat_inc(tcps_outswtso); + goto done; + } ip6stat_inc(ip6s_cantfrag); if (st->rt != PF_DUPTO) Index: net/pf_norm.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf_norm.c,v retrieving revision 1.227 diff -u -p -r1.227 pf_norm.c --- net/pf_norm.c 7 May 2023 16:23:23 -0000 1.227 +++ net/pf_norm.c 7 May 2023 18:50:16 -0000 @@ -968,9 +968,6 @@ pf_refragment6(struct mbuf **m0, struct mtag = NULL; ftag = NULL; - /* Checksum must be calculated for the whole packet */ - in6_proto_cksum_out(m, NULL); - if (extoff) { int off; Index: netinet/in.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in.h,v retrieving revision 1.142 diff -u -p -r1.142 in.h --- netinet/in.h 11 Apr 2023 00:45:09 -0000 1.142 +++ netinet/in.h 7 May 2023 18:50:16 -0000 @@ -780,6 +780,7 @@ int in_canforward(struct in_addr); int in_cksum(struct mbuf *, int); int in4_cksum(struct mbuf *, u_int8_t, int, int); void in_proto_cksum_out(struct mbuf *, struct ifnet *); +int in_ifcap_cksum(struct mbuf *, struct ifnet *, int); void in_ifdetach(struct ifnet *); int in_mask2len(struct in_addr *); void in_len2mask(struct in_addr *, int); Index: netinet/ip_output.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v retrieving revision 1.383 diff -u -p -r1.383 ip_output.c --- netinet/ip_output.c 7 May 2023 16:23:23 -0000 1.383 +++ netinet/ip_output.c 7 May 2023 18:50:16 -0000 @@ -84,7 +84,6 @@ 
void ip_mloopback(struct ifnet *, struct static __inline u_int16_t __attribute__((__unused__)) in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t); void in_delayed_cksum(struct mbuf *); -int in_ifcap_cksum(struct mbuf *, struct ifnet *, int); int ip_output_ipsec_lookup(struct mbuf *m, int hlen, struct inpcb *inp, struct tdb **, int ipsecflowinfo); @@ -443,7 +442,6 @@ sendit: goto reroute; } #endif - in_proto_cksum_out(m, ifp); #ifdef IPSEC if (ipsec_in_use && (flags & IP_FORWARDING) && (ipforwarding == 2) && @@ -464,11 +462,21 @@ sendit: ipstat_inc(ips_outswcsum); ip->ip_sum = in_cksum(m, hlen); } + in_proto_cksum_out(m, ifp); error = ifp->if_output(ifp, m, sintosa(dst), ro->ro_rt); goto done; } + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && + m->m_pkthdr.ph_mss <= mtu) { + if ((error = tcp_chopper(m, &ml, ifp, m->m_pkthdr.ph_mss)) || + (error = if_output_ml(ifp, &ml, sintosa(dst), ro->ro_rt))) + goto done; + tcpstat_inc(tcps_outswtso); + goto done; + } + /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. 
@@ -1861,7 +1869,11 @@ in_proto_cksum_out(struct mbuf *m, struc u_int16_t csum = 0, offset; offset = ip->ip_hl << 2; - if (m->m_pkthdr.csum_flags & (M_TCP_CSUM_OUT|M_UDP_CSUM_OUT)) + if (m->m_pkthdr.csum_flags & M_TCP_TSO) + csum = in_cksum_phdr(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl(ip->ip_p)); + else if (m->m_pkthdr.csum_flags & + (M_TCP_CSUM_OUT|M_UDP_CSUM_OUT)) csum = in_cksum_phdr(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(ntohs(ip->ip_len) - offset + ip->ip_p)); Index: netinet/tcp_output.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_output.c,v retrieving revision 1.135 diff -u -p -r1.135 tcp_output.c --- netinet/tcp_output.c 25 Apr 2023 22:56:28 -0000 1.135 +++ netinet/tcp_output.c 7 May 2023 18:50:16 -0000 @@ -210,6 +210,7 @@ tcp_output(struct tcpcb *tp) #ifdef TCP_ECN int needect; #endif + int tso; if (tp->t_flags & TF_BLOCKOUTPUT) { tp->t_flags |= TF_NEEDOUTPUT; @@ -279,6 +280,7 @@ again: } sendalot = 0; + tso = 0; /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero @@ -346,8 +348,25 @@ again: txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg); if (len > txmaxseg) { - len = txmaxseg; - sendalot = 1; + if (1 && + tp->t_inpcb->inp_options == NULL && + tp->t_inpcb->inp_outputopts6 == NULL && +#ifdef TCP_SIGNATURE + ((tp->t_flags & TF_SIGNATURE) == 0) && +#endif + len >= 2 * tp->t_maxseg && + tp->rcv_numsacks == 0 && sack_rxmit == 0 && + !(flags & (TH_SYN|TH_RST|TH_FIN))) { + tso = 1; + /* avoid small chopped packets */ + if (len > (len / tp->t_maxseg) * tp->t_maxseg) { + len = (len / tp->t_maxseg) * tp->t_maxseg; + sendalot = 1; + } + } else { + len = txmaxseg; + sendalot = 1; + } } if (off + len < so->so_snd.sb_cc) flags &= ~TH_FIN; @@ -365,7 +384,7 @@ again: * to send into a small window), then must resend. 
*/ if (len) { - if (len == txmaxseg) + if (len >= txmaxseg) goto send; if ((idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.sb_cc && !soissending(so) && @@ -616,10 +635,19 @@ send: /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxopd length. + * Clear the FIN bit because we cut off the tail of + * the segment. */ if (len > tp->t_maxopd - optlen) { - len = tp->t_maxopd - optlen; - sendalot = 1; + if (tso) { + if (len + hdrlen + max_linkhdr > MAXMCLBYTES) { + len = MAXMCLBYTES - hdrlen - max_linkhdr; + sendalot = 1; + } + } else { + len = tp->t_maxopd - optlen; + sendalot = 1; + } flags &= ~TH_FIN; } @@ -723,6 +751,12 @@ send: m->m_pkthdr.ph_ifidx = 0; m->m_pkthdr.len = hdrlen + len; + /* Enable TSO and specify the size of the resulting segments. */ + if (tso) { + m->m_pkthdr.csum_flags |= M_TCP_TSO; + m->m_pkthdr.ph_mss = tp->t_maxseg; + } + if (!tp->t_template) panic("tcp_output"); #ifdef DIAGNOSTIC @@ -1152,4 +1186,182 @@ tcp_setpersist(struct tcpcb *tp) TCP_TIMER_ARM(tp, TCPT_PERSIST, msec); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; +} + +int +tcp_chopper(struct mbuf *m0, struct mbuf_list *ml, struct ifnet *ifp, + u_long mss) +{ + struct ip *ip = NULL; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; +#endif + struct tcphdr *th; + int firstlen, iphlen, hlen, tlen, off; + int error; + + ml_init(ml); + ml_enqueue(ml, m0); + + ip = mtod(m0, struct ip *); + switch (ip->ip_v) { + case 4: + iphlen = ip->ip_hl << 2; + if (ISSET(ip->ip_off, htons(IP_OFFMASK | IP_MF)) || + iphlen != sizeof(struct ip) || ip->ip_p != IPPROTO_TCP) { + /* only TCP without fragment or IP option supported */ + error = EPROTOTYPE; + goto bad; + } + break; +#ifdef INET6 + case 6: + ip = NULL; + ip6 = mtod(m0, struct ip6_hdr *); + iphlen = sizeof(struct ip6_hdr); + if (ip6->ip6_nxt != IPPROTO_TCP) { + /* only TCP without IPv6 header chain supported */ + error = EPROTOTYPE; + goto bad; + } + break; +#endif + default: + panic("%s: 
unknown ip version %d", __func__, ip->ip_v); + } + + tlen = m0->m_pkthdr.len; + if (tlen < iphlen + sizeof(struct tcphdr)) { + error = EMSGSIZE; + goto bad; + } + /* IP and TCP header should be contiguous, this check is paranoia */ + if (m0->m_len < iphlen + sizeof(*th)) { + ml_dequeue(ml); + if ((m0 = m_pullup(m0, iphlen + sizeof(*th))) == NULL) { + error = ENOBUFS; + goto bad; + } + ml_enqueue(ml, m0); + } + th = (struct tcphdr *)(mtod(m0, caddr_t) + iphlen); + hlen = iphlen + (th->th_off << 2); + if (tlen < hlen) { + error = EMSGSIZE; + goto bad; + } + firstlen = MIN(tlen - hlen, mss); + + CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO); + + /* + * Loop through length of payload after first segment, + * make new header and copy data of each part and link onto chain. + */ + for (off = hlen + firstlen; off < tlen; off += mss) { + struct mbuf *m; + struct tcphdr *mhth; + int len; + + len = MIN(tlen - off, mss); + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto bad; + } + ml_enqueue(ml, m); + if ((error = m_dup_pkthdr(m, m0, M_DONTWAIT)) != 0) + goto bad; + + /* IP and TCP header to the end, space for link layer header */ + m->m_len = hlen; + m_align(m, hlen); + + /* copy and adjust TCP header */ + mhth = (struct tcphdr *)(mtod(m, caddr_t) + iphlen); + memcpy(mhth, th, hlen - iphlen); + mhth->th_seq = htonl(ntohl(th->th_seq) + (off - hlen)); + if (off + len < tlen) + CLR(mhth->th_flags, TH_PUSH|TH_FIN); + + /* add mbuf chain with payload */ + m->m_pkthdr.len = hlen + len; + if ((m->m_next = m_copym(m0, off, len, M_DONTWAIT)) == NULL) { + error = ENOBUFS; + goto bad; + } + + /* copy and adjust IP header, calculate checksum */ + SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); + mhth->th_sum = 0; + if (ip) { + struct ip *mhip; + + mhip = mtod(m, struct ip *); + *mhip = *ip; + mhip->ip_len = htons(hlen + len); + mhip->ip_id = htons(ip_randomid()); + mhip->ip_sum = 0; + if (ifp && in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) { + m->m_pkthdr.csum_flags 
|= M_IPV4_CSUM_OUT; + } else { + ipstat_inc(ips_outswcsum); + mhip->ip_sum = in_cksum(m, iphlen); + } + in_proto_cksum_out(m, ifp); + } +#ifdef INET6 + if (ip6) { + struct ip6_hdr *mhip6; + + mhip6 = mtod(m, struct ip6_hdr *); + *mhip6 = *ip6; + mhip6->ip6_plen = htons(hlen - iphlen + len); + in6_proto_cksum_out(m, ifp); + } +#endif + } + + /* + * Update first segment by trimming what's been copied out + * and updating header, then send each segment (in order). + */ + if (hlen + firstlen < tlen) { + m_adj(m0, hlen + firstlen - tlen); + CLR(th->th_flags, TH_PUSH|TH_FIN); + } + /* adjust IP header, calculate checksum */ + SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT); + th->th_sum = 0; + if (ip) { + ip->ip_len = htons(m0->m_pkthdr.len); + ip->ip_sum = 0; + if (ifp && in_ifcap_cksum(m0, ifp, IFCAP_CSUM_IPv4)) { + m0->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT; + } else { + ipstat_inc(ips_outswcsum); + ip->ip_sum = in_cksum(m0, iphlen); + } + in_proto_cksum_out(m0, ifp); + } +#ifdef INET6 + if (ip6) { + ip6->ip6_plen = htons(m0->m_pkthdr.len - iphlen); + in6_proto_cksum_out(m0, ifp); + } +#endif + + tcpstat_add(tcps_outtso, ml_len(ml)); + return 0; + + bad: + if (ip) + ipstat_inc(ips_odropped); +#ifdef INET6 + if (ip6) + ip6stat_inc(ip6s_odropped); +#endif + ml_purge(ml); + return error; } Index: netinet/tcp_usrreq.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_usrreq.c,v retrieving revision 1.217 diff -u -p -r1.217 tcp_usrreq.c --- netinet/tcp_usrreq.c 14 Mar 2023 00:24:05 -0000 1.217 +++ netinet/tcp_usrreq.c 7 May 2023 18:50:16 -0000 @@ -1335,6 +1335,9 @@ tcp_sysctl_tcpstat(void *oldp, size_t *o ASSIGN(tcps_sack_rcv_opts); ASSIGN(tcps_sack_snd_opts); ASSIGN(tcps_sack_drop_opts); + ASSIGN(tcps_outswtso); + ASSIGN(tcps_outhwtso); + ASSIGN(tcps_outtso); #undef ASSIGN Index: netinet/tcp_var.h =================================================================== RCS file: 
/data/mirror/openbsd/cvs/src/sys/netinet/tcp_var.h,v retrieving revision 1.163 diff -u -p -r1.163 tcp_var.h --- netinet/tcp_var.h 14 Mar 2023 00:24:05 -0000 1.163 +++ netinet/tcp_var.h 7 May 2023 18:50:16 -0000 @@ -442,6 +442,10 @@ struct tcpstat { u_int64_t tcps_sack_rcv_opts; /* SACK options received */ u_int64_t tcps_sack_snd_opts; /* SACK options sent */ u_int64_t tcps_sack_drop_opts; /* SACK options dropped */ + + u_int32_t tcps_outswtso; /* output tso chopped in software */ + u_int32_t tcps_outhwtso; /* output tso processed by hardware */ + u_int32_t tcps_outtso; /* packets generated by tso */ }; /* @@ -614,6 +618,9 @@ enum tcpstat_counters { tcps_sack_rcv_opts, tcps_sack_snd_opts, tcps_sack_drop_opts, + tcps_outswtso, + tcps_outhwtso, + tcps_outtso, tcps_ncounters, }; @@ -706,6 +713,8 @@ struct tcpcb * tcp_newtcpcb(struct inpcb *, int); void tcp_notify(struct inpcb *, int); int tcp_output(struct tcpcb *); +int tcp_chopper(struct mbuf *, struct mbuf_list *, struct ifnet *, + u_long); void tcp_pulloutofband(struct socket *, u_int, struct mbuf *, int); int tcp_reass(struct tcpcb *, struct tcphdr *, struct mbuf *, int *); void tcp_rscale(struct tcpcb *, u_long); Index: netinet6/ip6_output.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_output.c,v retrieving revision 1.273 diff -u -p -r1.273 ip6_output.c --- netinet6/ip6_output.c 7 May 2023 16:23:24 -0000 1.273 +++ netinet6/ip6_output.c 7 May 2023 18:50:16 -0000 @@ -664,8 +664,6 @@ reroute: ip6->ip6_dst.s6_addr16[1] = dst_scope; } - in6_proto_cksum_out(m, ifp); - /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. 
@@ -688,7 +686,8 @@ reroute: dontfrag = 1; else dontfrag = 0; - if (dontfrag && tlen > ifp->if_mtu) { /* case 2-b */ + if (dontfrag && tlen > ifp->if_mtu && + !ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO)) { /* case 2-b */ #ifdef IPSEC if (ip_mtudisc) ipsec_adjust_mtu(m, mtu); @@ -701,10 +700,21 @@ reroute: * transmit packet without fragmentation */ if (dontfrag || (tlen <= mtu)) { /* case 1-a and 2-a */ + in6_proto_cksum_out(m, ifp); + error = ifp->if_output(ifp, m, sin6tosa(dst), ro->ro_rt); goto done; } + if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && + m->m_pkthdr.ph_mss <= mtu) { + if ((error = tcp_chopper(m, &ml, ifp, m->m_pkthdr.ph_mss)) || + (error = if_output_ml(ifp, &ml, sin6tosa(dst), ro->ro_rt))) + goto done; + tcpstat_inc(tcps_outswtso); + goto done; + } + /* * try to fragment the packet. case 1-b */ @@ -2704,8 +2714,12 @@ in6_proto_cksum_out(struct mbuf *m, stru u_int16_t csum; offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); - csum = in6_cksum_phdr(&ip6->ip6_src, &ip6->ip6_dst, - htonl(m->m_pkthdr.len - offset), htonl(nxt)); + if (m->m_pkthdr.csum_flags & M_TCP_TSO) + csum = in6_cksum_phdr(&ip6->ip6_src, &ip6->ip6_dst, 0, + htonl(nxt)); + else + csum = in6_cksum_phdr(&ip6->ip6_src, &ip6->ip6_dst, + htonl(m->m_pkthdr.len - offset), htonl(nxt)); if (nxt == IPPROTO_TCP) offset += offsetof(struct tcphdr, th_sum); else if (nxt == IPPROTO_UDP) Index: sys/mbuf.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/sys/mbuf.h,v retrieving revision 1.256 diff -u -p -r1.256 mbuf.h --- sys/mbuf.h 5 May 2023 01:19:51 -0000 1.256 +++ sys/mbuf.h 7 May 2023 18:50:16 -0000 @@ -129,12 +129,13 @@ struct pkthdr { SLIST_HEAD(, m_tag) ph_tags; /* list of packet tags */ int64_t ph_timestamp; /* packet timestamp */ int len; /* total packet length */ + u_int ph_rtableid; /* routing table id */ + u_int ph_ifidx; /* rcv interface index */ u_int16_t ph_tagsset; /* mtags attached */ u_int16_t ph_flowid; /* pseudo unique flow 
id */ u_int16_t csum_flags; /* checksum flags */ u_int16_t ether_vtag; /* Ethernet 802.1p+Q vlan tag */ - u_int ph_rtableid; /* routing table id */ - u_int ph_ifidx; /* rcv interface index */ + u_int16_t ph_mss; /* TCP max segment size */ u_int8_t ph_loopcnt; /* mbuf is looping in kernel */ u_int8_t ph_family; /* af, used when queueing */ struct pkthdr_pf pf; @@ -226,6 +227,7 @@ struct mbuf { #define M_IPV6_DF_OUT 0x1000 /* don't fragment outgoing IPv6 */ #define M_TIMESTAMP 0x2000 /* ph_timestamp is set */ #define M_FLOWID 0x4000 /* ph_flowid is set */ +#define M_TCP_TSO 0x8000 /* TCP Segmentation Offload needed */ #ifdef _KERNEL #define MCS_BITS \