sepherosa_gmail.com created this revision. sepherosa_gmail.com added reviewers: network, adrian, delphij, royger, decui_microsoft.com, honzhan_microsoft.com, howard0su_gmail.com, gallatin, hselasky, np. sepherosa_gmail.com added subscribers: freebsd-net-list, freebsd-virtualization-list. Herald added a reviewer: transport.
REVISION SUMMARY It's append_cnt based. Unless the network driver sets these two limits, its an NO-OP. For hn(4): - Set TCP ACK append limit to 1, i.e. aggregate 2 ACKs at most. Aggregate anything more than 2 hurts TCP sending performance in hyperv. This significantly improves the TCP sending performance when the number of concurrent connetion is low (2~8). And greatly stabilize the TCP sending performance in other cases. - Set TCP data segments append limit to 25. Without this limitation, hn(4) could aggregate ~45 TCP data segments for each connection (even at 64 or more connections) before dispatching them to socket code; large aggregation slows down ACK sending and eventually hurts/destabilizes TCP reception performance. This setting stabilizes and improves TCP reception performance for >4 concurrent connections significantly. Make them sysctls so they could be adjusted. REVISION DETAIL https://reviews.freebsd.org/D5185 AFFECTED FILES sys/dev/hyperv/netvsc/hv_net_vsc.h sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c sys/netinet/tcp_lro.c sys/netinet/tcp_lro.h EMAIL PREFERENCES https://reviews.freebsd.org/settings/panel/emailpreferences/ To: sepherosa_gmail.com, network, transport, adrian, delphij, royger, decui_microsoft.com, honzhan_microsoft.com, howard0su_gmail.com, gallatin, hselasky, np Cc: freebsd-virtualization-list, freebsd-net-list
diff --git a/sys/netinet/tcp_lro.h b/sys/netinet/tcp_lro.h --- a/sys/netinet/tcp_lro.h +++ b/sys/netinet/tcp_lro.h @@ -91,6 +91,8 @@ unsigned lro_cnt; unsigned lro_mbuf_count; unsigned lro_mbuf_max; + unsigned short lro_ack_append_lim; + unsigned short lro_data_append_lim; struct lro_head lro_active; struct lro_head lro_free; diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -88,6 +88,8 @@ lc->lro_mbuf_max = lro_mbufs; lc->lro_cnt = lro_entries; lc->ifp = ifp; + lc->lro_ack_append_lim = 0; + lc->lro_data_append_lim = 0; SLIST_INIT(&lc->lro_free); SLIST_INIT(&lc->lro_active); @@ -646,6 +648,16 @@ if (tcp_data_len == 0) { m_freem(m); + /* + * Flush this LRO entry, if this ACK should + * not be further delayed. + */ + if (lc->lro_ack_append_lim && + le->append_cnt >= lc->lro_ack_append_lim) { + SLIST_REMOVE(&lc->lro_active, le, lro_entry, + next); + tcp_lro_flush(lc, le); + } return (0); } @@ -664,9 +676,12 @@ /* * If a possible next full length packet would cause an - * overflow, pro-actively flush now. + * overflow, pro-actively flush now. And if we are asked + * to limit the data aggregate, flush this LRO entry now. */ - if (le->p_len > (65535 - lc->ifp->if_mtu)) { + if (le->p_len > (65535 - lc->ifp->if_mtu) || + (lc->lro_data_append_lim && + le->append_cnt >= lc->lro_data_append_lim)) { SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); tcp_lro_flush(lc, le); } else diff --git a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c --- a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c +++ b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c @@ -176,14 +176,8 @@ #define HN_CSUM_ASSIST_WIN8 (CSUM_TCP) #define HN_CSUM_ASSIST (CSUM_IP | CSUM_UDP | CSUM_TCP) -/* XXX move to netinet/tcp_lro.h */ -#define HN_LRO_HIWAT_MAX 65535 -#define HN_LRO_HIWAT_DEF HN_LRO_HIWAT_MAX -/* YYY 2*MTU is a bit rough, but should be good enough. */ -#define HN_LRO_HIWAT_MTULIM(ifp) (2 * (ifp)->if_mtu) -#define HN_LRO_HIWAT_ISVALID(sc, hiwat) \ - ((hiwat) >= HN_LRO_HIWAT_MTULIM((sc)->hn_ifp) || \ - (hiwat) <= HN_LRO_HIWAT_MAX) +#define HN_LRO_ACK_APPEND_LIM 1 +#define HN_LRO_DATA_APPEND_LIM 25 /* * Be aware that this sleepable mutex will exhibit WITNESS errors when @@ -253,27 +247,16 @@ static void hn_start_txeof(struct ifnet *ifp); static int hn_ifmedia_upd(struct ifnet *ifp); static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); -#ifdef HN_LRO_HIWAT -static int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS); -#endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_lro_append_lim_sysctl(SYSCTL_HANDLER_ARGS); static int hn_check_iplen(const struct mbuf *, int); static int hn_create_tx_ring(struct hn_softc *sc); static void hn_destroy_tx_ring(struct hn_softc *sc); static void hn_start_taskfunc(void *xsc, int pending); static void hn_txeof_taskfunc(void *xsc, int pending); static int hn_encap(struct hn_softc *, struct hn_txdesc *, struct mbuf **); -static __inline void -hn_set_lro_hiwat(struct hn_softc *sc, int hiwat) -{ - sc->hn_lro_hiwat = hiwat; -#ifdef HN_LRO_HIWAT - sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat; -#endif -} - static int hn_ifmedia_upd(struct ifnet *ifp __unused) { @@ -358,7 +341,6 @@ bzero(sc, sizeof(hn_softc_t)); sc->hn_unit = unit; sc->hn_dev = dev; - sc->hn_lro_hiwat = HN_LRO_HIWAT_DEF; sc->hn_direct_tx_size = hn_direct_tx_size; if (hn_trust_hosttcp) sc->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; @@ -442,9 +424,8 @@ /* Driver private LRO settings */ sc->hn_lro.ifp = ifp; #endif -#ifdef HN_LRO_HIWAT - sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat; -#endif + sc->hn_lro.lro_ack_append_lim = HN_LRO_ACK_APPEND_LIM; + sc->hn_lro.lro_data_append_lim = HN_LRO_DATA_APPEND_LIM; #endif /* INET || INET6 */ #if __FreeBSD_version >= 1100045 @@ -471,6 +452,13 @@ hn_tx_chimney_size < sc->hn_tx_chimney_max) sc->hn_tx_chimney_size = hn_tx_chimney_size; + /* + * Always schedule transmission instead of trying + * to do direct transmission. This one gives the + * best performance so far. + */ + sc->hn_sched_tx = 1; + ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); @@ -480,11 +468,6 @@ CTLFLAG_RW, &sc->hn_lro.lro_flushed, 0, "LRO flushed"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "lro_tried", CTLFLAG_RW, &sc->hn_lro_tried, "# of LRO tries"); -#ifdef HN_LRO_HIWAT - SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_hiwat", - CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_hiwat_sysctl, - "I", "LRO high watermark"); -#endif SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_TCP, hn_trust_hcsum_sysctl, "I", @@ -538,6 +521,14 @@ CTLFLAG_RW, &sc->hn_sched_tx, 0, "Always schedule transmission " "instead of doing direct transmission"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ack_append_lim", + CTLTYPE_INT | CTLFLAG_RW, &sc->hn_lro.lro_ack_append_lim, + 0, hn_lro_append_lim_sysctl, + "I", "LRO ACK appending limitation"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_data_append_lim", + CTLTYPE_INT | CTLFLAG_RW, &sc->hn_lro.lro_data_append_lim, + 0, hn_lro_append_lim_sysctl, + "I", "LRO data segments appending limitation"); if (unit == 0) { struct sysctl_ctx_list *dc_ctx; @@ -1410,13 +1401,6 @@ /* Obtain and record requested MTU */ ifp->if_mtu = ifr->ifr_mtu; - /* - * Make sure that LRO high watermark is still valid, - * after MTU change (the 2*MTU limit). - */ - if (!HN_LRO_HIWAT_ISVALID(sc, sc->hn_lro_hiwat)) - hn_set_lro_hiwat(sc, HN_LRO_HIWAT_MTULIM(ifp)); - do { NV_LOCK(sc); if (!sc->temp_unusable) { @@ -1722,27 +1706,6 @@ } #endif -#ifdef HN_LRO_HIWAT -static int -hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS) -{ - struct hn_softc *sc = arg1; - int hiwat, error; - - hiwat = sc->hn_lro_hiwat; - error = sysctl_handle_int(oidp, &hiwat, 0, req); - if (error || req->newptr == NULL) - return error; - - if (!HN_LRO_HIWAT_ISVALID(sc, hiwat)) - return EINVAL; - - if (sc->hn_lro_hiwat != hiwat) - hn_set_lro_hiwat(sc, hiwat); - return 0; -} -#endif /* HN_LRO_HIWAT */ - static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) { @@ -1787,6 +1750,24 @@ } static int +hn_lro_append_lim_sysctl(SYSCTL_HANDLER_ARGS) +{ + unsigned short *lro_lim = arg1; + int lim, error; + + lim = *lro_lim; + error = sysctl_handle_int(oidp, &lim, 0, req); + if (error || req->newptr == NULL) + return error; + + if (lim < 0 || lim > 65535) + return EINVAL; + + *lro_lim = lim; + return 0; +} + +static int hn_check_iplen(const struct mbuf *m, int hoff) { const struct ip *ip; diff --git a/sys/dev/hyperv/netvsc/hv_net_vsc.h b/sys/dev/hyperv/netvsc/hv_net_vsc.h --- a/sys/dev/hyperv/netvsc/hv_net_vsc.h +++ b/sys/dev/hyperv/netvsc/hv_net_vsc.h @@ -1030,7 +1030,6 @@ struct task hn_txeof_task; struct lro_ctrl hn_lro; - int hn_lro_hiwat; /* Trust csum verification on host side */ int hn_trust_hcsum; /* HN_TRUST_HCSUM_ */
_______________________________________________ freebsd-net@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/freebsd-net To unsubscribe, send any mail to "freebsd-net-unsubscr...@freebsd.org"