Author: sephe
Date: Thu Dec 29 06:06:05 2016
New Revision: 310736
URL: https://svnweb.freebsd.org/changeset/base/310736

Log:
  MFC 308905
  
      hyperv/hn: Implement RNDIS multi-packet message support.
  
      Currently, it is only applied to packet sent through chimney sending
      buffers.  Not enabled by default yet.
  
      This one gives 20%~30% performance boost for non-TSO usage in both
      bit/packet rate tests and nginx performance test.
  
      Sponsored by:   Microsoft
      Differential Revision:  https://reviews.freebsd.org/D8560

Modified:
  stable/10/sys/dev/hyperv/netvsc/hn_rndis.c
  stable/10/sys/dev/hyperv/netvsc/if_hn.c
  stable/10/sys/dev/hyperv/netvsc/if_hnvar.h
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/dev/hyperv/netvsc/hn_rndis.c
==============================================================================
--- stable/10/sys/dev/hyperv/netvsc/hn_rndis.c  Thu Dec 29 05:32:34 2016        
(r310735)
+++ stable/10/sys/dev/hyperv/netvsc/hn_rndis.c  Thu Dec 29 06:06:05 2016        
(r310736)
@@ -839,11 +839,15 @@ hn_rndis_init(struct hn_softc *sc)
                error = EIO;
                goto done;
        }
+       sc->hn_rndis_agg_size = comp->rm_pktmaxsz;
+       sc->hn_rndis_agg_pkts = comp->rm_pktmaxcnt;
+       sc->hn_rndis_agg_align = 1U << comp->rm_align;
+
        if (bootverbose) {
                if_printf(sc->hn_ifp, "RNDIS ver %u.%u, pktsz %u, pktcnt %u, "
                    "align %u\n", comp->rm_ver_major, comp->rm_ver_minor,
-                   comp->rm_pktmaxsz, comp->rm_pktmaxcnt,
-                   1U << comp->rm_align);
+                   sc->hn_rndis_agg_size, sc->hn_rndis_agg_pkts,
+                   sc->hn_rndis_agg_align);
        }
        error = 0;
 done:

Modified: stable/10/sys/dev/hyperv/netvsc/if_hn.c
==============================================================================
--- stable/10/sys/dev/hyperv/netvsc/if_hn.c     Thu Dec 29 05:32:34 2016        
(r310735)
+++ stable/10/sys/dev/hyperv/netvsc/if_hn.c     Thu Dec 29 06:06:05 2016        
(r310736)
@@ -162,10 +162,22 @@ __FBSDID("$FreeBSD$");
 #define HN_CSUM_IP6_HWASSIST(sc)       \
        ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
 
+#define HN_PKTSIZE_MIN(align)          \
+       roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
+           HN_RNDIS_PKT_LEN, (align))
+#define HN_PKTSIZE(m, align)           \
+       roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
+
 struct hn_txdesc {
 #ifndef HN_USE_TXDESC_BUFRING
        SLIST_ENTRY(hn_txdesc)          link;
 #endif
+       STAILQ_ENTRY(hn_txdesc)         agg_link;
+
+       /* Aggregated txdescs, in sending order. */
+       STAILQ_HEAD(, hn_txdesc)        agg_list;
+
+       /* The oldest packet, if transmission aggregation happens. */
        struct mbuf                     *m;
        struct hn_tx_ring               *txr;
        int                             refs;
@@ -183,6 +195,7 @@ struct hn_txdesc {
 
 #define HN_TXD_FLAG_ONLIST             0x0001
 #define HN_TXD_FLAG_DMAMAP             0x0002
+#define HN_TXD_FLAG_ONAGG              0x0004
 
 struct hn_rxinfo {
        uint32_t                        vlan_info;
@@ -262,6 +275,10 @@ static int                 hn_rxfilter_sysctl(SYSCTL_H
 static int                     hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
 static int                     hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
 static int                     hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
+static int                     hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
+static int                     hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
+static int                     hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
+static int                     hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
 
 static void                    hn_stop(struct hn_softc *);
 static void                    hn_init_locked(struct hn_softc *);
@@ -309,7 +326,7 @@ static int                  hn_create_tx_data(struct hn
 static void                    hn_fixup_tx_data(struct hn_softc *);
 static void                    hn_destroy_tx_data(struct hn_softc *);
 static void                    hn_txdesc_dmamap_destroy(struct hn_txdesc *);
-static int                     hn_encap(struct hn_tx_ring *,
+static int                     hn_encap(struct ifnet *, struct hn_tx_ring *,
                                    struct hn_txdesc *, struct mbuf **);
 static int                     hn_txpkt(struct ifnet *, struct hn_tx_ring *,
                                    struct hn_txdesc *);
@@ -318,6 +335,10 @@ static void                        
hn_set_tso_maxsize(struct 
 static bool                    hn_tx_ring_pending(struct hn_tx_ring *);
 static void                    hn_tx_ring_qflush(struct hn_tx_ring *);
 static void                    hn_resume_tx(struct hn_softc *, int);
+static void                    hn_set_txagg(struct hn_softc *);
+static void                    *hn_try_txagg(struct ifnet *,
+                                   struct hn_tx_ring *, struct hn_txdesc *,
+                                   int);
 static int                     hn_get_txswq_depth(const struct hn_tx_ring *);
 static void                    hn_txpkt_done(struct hn_nvs_sendctx *,
                                    struct hn_softc *, struct vmbus_channel *,
@@ -433,6 +454,16 @@ SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_
     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
 #endif
 
+/* Packet transmission aggregation size limit */
+static int                     hn_tx_agg_size = -1;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
+    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
+
+/* Packet transmission aggregation count limit */
+static int                     hn_tx_agg_pkts = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
+    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
+
 static u_int                   hn_cpu_index;   /* next CPU for channel */
 static struct taskqueue                *hn_tx_taskq;   /* shared TX taskqueue 
*/
 
@@ -661,6 +692,84 @@ hn_set_rxfilter(struct hn_softc *sc)
        return (error);
 }
 
+static void
+hn_set_txagg(struct hn_softc *sc)
+{
+       uint32_t size, pkts;
+       int i;
+
+       /*
+        * Setup aggregation size.
+        */
+       if (sc->hn_agg_size < 0)
+               size = UINT32_MAX;
+       else
+               size = sc->hn_agg_size;
+
+       if (sc->hn_rndis_agg_size < size)
+               size = sc->hn_rndis_agg_size;
+
+       if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
+               /* Disable */
+               size = 0;
+               pkts = 0;
+               goto done;
+       }
+
+       /* NOTE: Type of the per TX ring setting is 'int'. */
+       if (size > INT_MAX)
+               size = INT_MAX;
+
+       /* NOTE: We only aggregate packets using chimney sending buffers. */
+       if (size > (uint32_t)sc->hn_chim_szmax)
+               size = sc->hn_chim_szmax;
+
+       /*
+        * Setup aggregation packet count.
+        */
+       if (sc->hn_agg_pkts < 0)
+               pkts = UINT32_MAX;
+       else
+               pkts = sc->hn_agg_pkts;
+
+       if (sc->hn_rndis_agg_pkts < pkts)
+               pkts = sc->hn_rndis_agg_pkts;
+
+       if (pkts <= 1) {
+               /* Disable */
+               size = 0;
+               pkts = 0;
+               goto done;
+       }
+
+       /* NOTE: Type of the per TX ring setting is 'short'. */
+       if (pkts > SHRT_MAX)
+               pkts = SHRT_MAX;
+
+done:
+       /* NOTE: Type of the per TX ring setting is 'short'. */
+       if (sc->hn_rndis_agg_align > SHRT_MAX) {
+               /* Disable */
+               size = 0;
+               pkts = 0;
+       }
+
+       if (bootverbose) {
+               if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
+                   size, pkts, sc->hn_rndis_agg_align);
+       }
+
+       for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
+               struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
+
+               mtx_lock(&txr->hn_tx_lock);
+               txr->hn_agg_szmax = size;
+               txr->hn_agg_pktmax = pkts;
+               txr->hn_agg_align = sc->hn_rndis_agg_align;
+               mtx_unlock(&txr->hn_tx_lock);
+       }
+}
+
 static int
 hn_get_txswq_depth(const struct hn_tx_ring *txr)
 {
@@ -801,6 +910,12 @@ hn_attach(device_t dev)
        HN_LOCK_INIT(sc);
 
        /*
+        * Initialize these tunables once.
+        */
+       sc->hn_agg_size = hn_tx_agg_size;
+       sc->hn_agg_pkts = hn_tx_agg_pkts;
+
+       /*
         * Setup taskqueue for transmission.
         */
        if (hn_tx_taskq == NULL) {
@@ -956,6 +1071,24 @@ hn_attach(device_t dev)
        SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
            CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
            hn_rss_ind_sysctl, "IU", "RSS indirect table");
+       SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
+           CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
+           "RNDIS offered packet transmission aggregation size limit");
+       SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
+           CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
+           "RNDIS offered packet transmission aggregation count limit");
+       SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
+           CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
+           "RNDIS packet transmission aggregation alignment");
+       SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
+           CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+           hn_txagg_size_sysctl, "I",
+           "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
+       SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
+           CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+           hn_txagg_pkts_sysctl, "I",
+           "Packet transmission aggregation packets, "
+           "0 -- disable, -1 -- auto");
 
        /*
         * Setup the ifmedia, which has been initialized earlier.
@@ -1211,16 +1344,45 @@ hn_txdesc_put(struct hn_tx_ring *txr, st
 
        KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
            ("put an onlist txd %#x", txd->flags));
+       KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
+           ("put an onagg txd %#x", txd->flags));
 
        KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
        if (atomic_fetchadd_int(&txd->refs, -1) != 1)
                return 0;
 
+       if (!STAILQ_EMPTY(&txd->agg_list)) {
+               struct hn_txdesc *tmp_txd;
+
+               while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
+                       int freed;
+
+                       KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
+                           ("resursive aggregation on aggregated txdesc"));
+                       KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
+                           ("not aggregated txdesc"));
+                       KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
+                           ("aggregated txdesc uses dmamap"));
+                       KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
+                           ("aggregated txdesc consumes "
+                            "chimney sending buffer"));
+                       KASSERT(tmp_txd->chim_size == 0,
+                           ("aggregated txdesc has non-zero "
+                            "chimney sending size"));
+
+                       STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
+                       tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
+                       freed = hn_txdesc_put(txr, tmp_txd);
+                       KASSERT(freed, ("failed to free aggregated txdesc"));
+               }
+       }
+
        if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
                KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
                    ("chim txd uses dmamap"));
                hn_chim_free(txr->hn_sc, txd->chim_index);
                txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
+               txd->chim_size = 0;
        } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
                bus_dmamap_sync(txr->hn_tx_data_dtag,
                    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
@@ -1275,8 +1437,11 @@ hn_txdesc_get(struct hn_tx_ring *txr)
                atomic_subtract_int(&txr->hn_txdesc_avail, 1);
 #endif
                KASSERT(txd->m == NULL && txd->refs == 0 &&
+                   STAILQ_EMPTY(&txd->agg_list) &&
                    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
+                   txd->chim_size == 0 &&
                    (txd->flags & HN_TXD_FLAG_ONLIST) &&
+                   (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
                    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
                txd->flags &= ~HN_TXD_FLAG_ONLIST;
                txd->refs = 1;
@@ -1293,6 +1458,22 @@ hn_txdesc_hold(struct hn_txdesc *txd)
        atomic_add_int(&txd->refs, 1);
 }
 
+static __inline void
+hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
+{
+
+       KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
+           ("recursive aggregation on aggregating txdesc"));
+
+       KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
+           ("already aggregated"));
+       KASSERT(STAILQ_EMPTY(&txd->agg_list),
+           ("recursive aggregation on to-be-aggregated txdesc"));
+
+       txd->flags |= HN_TXD_FLAG_ONAGG;
+       STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
+}
+
 static bool
 hn_tx_ring_pending(struct hn_tx_ring *txr)
 {
@@ -1410,12 +1591,123 @@ hn_rndis_pktinfo_append(struct rndis_pac
        return (pi->rm_data);
 }
 
+static __inline int
+hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
+{
+       struct hn_txdesc *txd;
+       struct mbuf *m;
+       int error, pkts;
+
+       txd = txr->hn_agg_txd;
+       KASSERT(txd != NULL, ("no aggregate txdesc"));
+
+       /*
+        * Since hn_txpkt() will reset this temporary stat, save
+        * it now, so that oerrors can be updated properly, if
+        * hn_txpkt() ever fails.
+        */
+       pkts = txr->hn_stat_pkts;
+
+       /*
+        * Since txd's mbuf will _not_ be freed upon hn_txpkt()
+        * failure, save it for later freeing, if hn_txpkt() ever
+        * fails.
+        */
+       m = txd->m;
+       error = hn_txpkt(ifp, txr, txd);
+       if (__predict_false(error)) {
+               /* txd is freed, but m is not. */
+               m_freem(m);
+
+               txr->hn_flush_failed++;
+               if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
+       }
+
+       /* Reset all aggregation states. */
+       txr->hn_agg_txd = NULL;
+       txr->hn_agg_szleft = 0;
+       txr->hn_agg_pktleft = 0;
+       txr->hn_agg_prevpkt = NULL;
+
+       return (error);
+}
+
+static void *
+hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
+    int pktsize)
+{
+       void *chim;
+
+       if (txr->hn_agg_txd != NULL) {
+               if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
+                       struct hn_txdesc *agg_txd = txr->hn_agg_txd;
+                       struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
+                       int olen;
+
+                       /*
+                        * Update the previous RNDIS packet's total length,
+                        * it can be increased due to the mandatory alignment
+                        * padding for this RNDIS packet.  And update the
+                        * aggregating txdesc's chimney sending buffer size
+                        * accordingly.
+                        *
+                        * XXX
+                        * Zero-out the padding, as required by the RNDIS spec.
+                        */
+                       olen = pkt->rm_len;
+                       pkt->rm_len = roundup2(olen, txr->hn_agg_align);
+                       agg_txd->chim_size += pkt->rm_len - olen;
+
+                       /* Link this txdesc to the parent. */
+                       hn_txdesc_agg(agg_txd, txd);
+
+                       chim = (uint8_t *)pkt + pkt->rm_len;
+                       /* Save the current packet for later fixup. */
+                       txr->hn_agg_prevpkt = chim;
+
+                       txr->hn_agg_pktleft--;
+                       txr->hn_agg_szleft -= pktsize;
+                       if (txr->hn_agg_szleft <=
+                           HN_PKTSIZE_MIN(txr->hn_agg_align)) {
+                               /*
+                                * Probably can't aggregate more packets,
+                                * flush this aggregating txdesc proactively.
+                                */
+                               txr->hn_agg_pktleft = 0;
+                       }
+                       /* Done! */
+                       return (chim);
+               }
+               hn_flush_txagg(ifp, txr);
+       }
+       KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
+
+       txr->hn_tx_chimney_tried++;
+       txd->chim_index = hn_chim_alloc(txr->hn_sc);
+       if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
+               return (NULL);
+       txr->hn_tx_chimney++;
+
+       chim = txr->hn_sc->hn_chim +
+           (txd->chim_index * txr->hn_sc->hn_chim_szmax);
+
+       if (txr->hn_agg_pktmax > 1 &&
+           txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
+               txr->hn_agg_txd = txd;
+               txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
+               txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
+               txr->hn_agg_prevpkt = chim;
+       }
+       return (chim);
+}
+
 /*
  * NOTE:
  * If this function fails, then both txd and m_head0 will be freed.
  */
 static int
-hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
+hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
+    struct mbuf **m_head0)
 {
        bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
        int error, nsegs, i;
@@ -1423,33 +1715,30 @@ hn_encap(struct hn_tx_ring *txr, struct 
        struct rndis_packet_msg *pkt;
        uint32_t *pi_data;
        void *chim = NULL;
-       int pktlen;
+       int pkt_hlen, pkt_size;
 
        pkt = txd->rndis_pkt;
-       if (m_head->m_pkthdr.len + HN_RNDIS_PKT_LEN < txr->hn_chim_size) {
-               /*
-                * This packet is small enough to fit into a chimney sending
-                * buffer.  Try allocating one chimney sending buffer now.
-                */
-               txr->hn_tx_chimney_tried++;
-               txd->chim_index = hn_chim_alloc(txr->hn_sc);
-               if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
-                       chim = txr->hn_sc->hn_chim +
-                           (txd->chim_index * txr->hn_sc->hn_chim_szmax);
-                       /*
-                        * Directly fill the chimney sending buffer w/ the
-                        * RNDIS packet message.
-                        */
+       pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
+       if (pkt_size < txr->hn_chim_size) {
+               chim = hn_try_txagg(ifp, txr, txd, pkt_size);
+               if (chim != NULL)
                        pkt = chim;
-               }
+       } else {
+               if (txr->hn_agg_txd != NULL)
+                       hn_flush_txagg(ifp, txr);
        }
 
        pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
        pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
        pkt->rm_dataoffset = sizeof(*pkt);
        pkt->rm_datalen = m_head->m_pkthdr.len;
+       pkt->rm_oobdataoffset = 0;
+       pkt->rm_oobdatalen = 0;
+       pkt->rm_oobdataelements = 0;
        pkt->rm_pktinfooffset = sizeof(*pkt);
        pkt->rm_pktinfolen = 0;
+       pkt->rm_vchandle = 0;
+       pkt->rm_reserved = 0;
 
        if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
                /*
@@ -1510,7 +1799,7 @@ hn_encap(struct hn_tx_ring *txr, struct 
                        *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
        }
 
-       pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
+       pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
        /* Convert RNDIS packet message offsets */
        pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
        pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
@@ -1519,25 +1808,36 @@ hn_encap(struct hn_tx_ring *txr, struct 
         * Fast path: Chimney sending.
         */
        if (chim != NULL) {
-               KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
-                   ("chimney buffer is not used"));
-               KASSERT(pkt == chim, ("RNDIS pkt not in chimney buffer"));
+               struct hn_txdesc *tgt_txd = txd;
+
+               if (txr->hn_agg_txd != NULL) {
+                       tgt_txd = txr->hn_agg_txd;
+#ifdef INVARIANTS
+                       *m_head0 = NULL;
+#endif
+               }
+
+               KASSERT(pkt == chim,
+                   ("RNDIS pkt not in chimney sending buffer"));
+               KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
+                   ("chimney sending buffer is not used"));
+               tgt_txd->chim_size += pkt->rm_len;
 
                m_copydata(m_head, 0, m_head->m_pkthdr.len,
-                   ((uint8_t *)chim) + pktlen);
+                   ((uint8_t *)chim) + pkt_hlen);
 
-               txd->chim_size = pkt->rm_len;
                txr->hn_gpa_cnt = 0;
-               txr->hn_tx_chimney++;
                txr->hn_sendpkt = hn_txpkt_chim;
                goto done;
        }
+
+       KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
        KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
            ("chimney buffer is used"));
        KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
 
        error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
-       if (error) {
+       if (__predict_false(error)) {
                int freed;
 
                /*
@@ -1551,7 +1851,7 @@ hn_encap(struct hn_tx_ring *txr, struct 
                    ("fail to free txd upon txdma error"));
 
                txr->hn_txdma_failed++;
-               if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1);
+               if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
                return error;
        }
        *m_head0 = m_head;
@@ -1562,7 +1862,7 @@ hn_encap(struct hn_tx_ring *txr, struct 
        /* send packet with page buffer */
        txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
        txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
-       txr->hn_gpa[0].gpa_len = pktlen;
+       txr->hn_gpa[0].gpa_len = pkt_hlen;
 
        /*
         * Fill the page buffers with mbuf info after the page
@@ -1585,6 +1885,12 @@ done:
        /* Set the completion routine */
        hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
 
+       /* Update temporary stats for later use. */
+       txr->hn_stat_pkts++;
+       txr->hn_stat_size += m_head->m_pkthdr.len;
+       if (m_head->m_flags & M_MCAST)
+               txr->hn_stat_mcasts++;
+
        return 0;
 }
 
@@ -1600,23 +1906,34 @@ hn_txpkt(struct ifnet *ifp, struct hn_tx
 
 again:
        /*
-        * Make sure that txd is not freed before ETHER_BPF_MTAP.
+        * Make sure that this txd and any aggregated txds are not freed
+        * before ETHER_BPF_MTAP.
         */
        hn_txdesc_hold(txd);
        error = txr->hn_sendpkt(txr, txd);
        if (!error) {
-               ETHER_BPF_MTAP(ifp, txd->m);
-               if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+               if (bpf_peers_present(ifp->if_bpf)) {
+                       const struct hn_txdesc *tmp_txd;
+
+                       ETHER_BPF_MTAP(ifp, txd->m);
+                       STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
+                               ETHER_BPF_MTAP(ifp, tmp_txd->m);
+               }
+
+               if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
 #ifdef HN_IFSTART_SUPPORT
                if (!hn_use_if_start)
 #endif
                {
                        if_inc_counter(ifp, IFCOUNTER_OBYTES,
-                           txd->m->m_pkthdr.len);
-                       if (txd->m->m_flags & M_MCAST)
-                               if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
+                           txr->hn_stat_size);
+                       if (txr->hn_stat_mcasts != 0) {
+                               if_inc_counter(ifp, IFCOUNTER_OMCASTS,
+                                   txr->hn_stat_mcasts);
+                       }
                }
-               txr->hn_pkts++;
+               txr->hn_pkts += txr->hn_stat_pkts;
+               txr->hn_sends++;
        }
        hn_txdesc_put(txr, txd);
 
@@ -1656,7 +1973,13 @@ again:
 
                txr->hn_send_failed++;
        }
-       return error;
+
+       /* Reset temporary stats, after this sending is done. */
+       txr->hn_stat_size = 0;
+       txr->hn_stat_pkts = 0;
+       txr->hn_stat_mcasts = 0;
+
+       return (error);
 }
 
 /*
@@ -2438,6 +2761,64 @@ hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARG
 }
 
 static int
+hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
+{
+       struct hn_softc *sc = arg1;
+       int error, size;
+
+       size = sc->hn_agg_size;
+       error = sysctl_handle_int(oidp, &size, 0, req);
+       if (error || req->newptr == NULL)
+               return (error);
+
+       HN_LOCK(sc);
+       sc->hn_agg_size = size;
+       hn_set_txagg(sc);
+       HN_UNLOCK(sc);
+
+       return (0);
+}
+
+static int
+hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
+{
+       struct hn_softc *sc = arg1;
+       int error, pkts;
+
+       pkts = sc->hn_agg_pkts;
+       error = sysctl_handle_int(oidp, &pkts, 0, req);
+       if (error || req->newptr == NULL)
+               return (error);
+
+       HN_LOCK(sc);
+       sc->hn_agg_pkts = pkts;
+       hn_set_txagg(sc);
+       HN_UNLOCK(sc);
+
+       return (0);
+}
+
+static int
+hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
+{
+       struct hn_softc *sc = arg1;
+       int pkts;
+
+       pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
+       return (sysctl_handle_int(oidp, &pkts, 0, req));
+}
+
+static int
+hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
+{
+       struct hn_softc *sc = arg1;
+       int align;
+
+       align = sc->hn_tx_ring[0].hn_agg_align;
+       return (sysctl_handle_int(oidp, &align, 0, req));
+}
+
+static int
 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
 {
        struct hn_softc *sc = arg1;
@@ -2980,6 +3361,7 @@ hn_tx_ring_create(struct hn_softc *sc, i
 
                txd->txr = txr;
                txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
+               STAILQ_INIT(&txd->agg_list);
 
                /*
                 * Allocate and load RNDIS packet message.
@@ -3063,6 +3445,8 @@ hn_tx_ring_create(struct hn_softc *sc, i
                        SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
                            CTLFLAG_RW, &txr->hn_pkts,
                            "# of packets transmitted");
+                       SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
+                           CTLFLAG_RW, &txr->hn_sends, "# of sends");
                }
        }
 
@@ -3177,6 +3561,11 @@ hn_create_tx_data(struct hn_softc *sc, i
            CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
            __offsetof(struct hn_tx_ring, hn_txdma_failed),
            hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
+       SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
+           CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
+           __offsetof(struct hn_tx_ring, hn_flush_failed),
+           hn_tx_stat_ulong_sysctl, "LU",
+           "# of packet transmission aggregation flush failure");
        SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
            CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
            __offsetof(struct hn_tx_ring, hn_tx_collapsed),
@@ -3213,6 +3602,17 @@ hn_create_tx_data(struct hn_softc *sc, i
            CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
        SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
            CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
+       SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
+           CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
+           "Applied packet transmission aggregation size");
+       SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
+           CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+           hn_txagg_pktmax_sysctl, "I",
+           "Applied packet transmission aggregation packets");
+       SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
+           CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+           hn_txagg_align_sysctl, "I",
+           "Applied packet transmission aggregation alignment");
 
        return 0;
 }
@@ -3332,18 +3732,20 @@ hn_start_locked(struct hn_tx_ring *txr, 
 {
        struct hn_softc *sc = txr->hn_sc;
        struct ifnet *ifp = sc->hn_ifp;
+       int sched = 0;
 
        KASSERT(hn_use_if_start,
            ("hn_start_locked is called, when if_start is disabled"));
        KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
        mtx_assert(&txr->hn_tx_lock, MA_OWNED);
+       KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
 
        if (__predict_false(txr->hn_suspended))
-               return 0;
+               return (0);
 
        if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
            IFF_DRV_RUNNING)
-               return 0;
+               return (0);
 
        while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
                struct hn_txdesc *txd;
@@ -3361,7 +3763,8 @@ hn_start_locked(struct hn_tx_ring *txr, 
                         * following up packets) to tx taskqueue.
                         */
                        IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
-                       return 1;
+                       sched = 1;
+                       break;
                }
 
 #if defined(INET6) || defined(INET)
@@ -3382,21 +3785,50 @@ hn_start_locked(struct hn_tx_ring *txr, 
                        break;
                }
 
-               error = hn_encap(txr, txd, &m_head);
+               error = hn_encap(ifp, txr, txd, &m_head);
                if (error) {
                        /* Both txd and m_head are freed */
+                       KASSERT(txr->hn_agg_txd == NULL,
+                           ("encap failed w/ pending aggregating txdesc"));
                        continue;
                }
 
-               error = hn_txpkt(ifp, txr, txd);
-               if (__predict_false(error)) {
-                       /* txd is freed, but m_head is not */
-                       IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
-                       atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
-                       break;
+               if (txr->hn_agg_pktleft == 0) {
+                       if (txr->hn_agg_txd != NULL) {
+                               KASSERT(m_head == NULL,
+                                   ("pending mbuf for aggregating txdesc"));
+                               error = hn_flush_txagg(ifp, txr);
+                               if (__predict_false(error)) {
+                                       atomic_set_int(&ifp->if_drv_flags,
+                                           IFF_DRV_OACTIVE);
+                                       break;
+                               }
+                       } else {
+                               KASSERT(m_head != NULL, ("mbuf was freed"));
+                               error = hn_txpkt(ifp, txr, txd);
+                               if (__predict_false(error)) {
+                                       /* txd is freed, but m_head is not */
+                                       IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
+                                       atomic_set_int(&ifp->if_drv_flags,
+                                           IFF_DRV_OACTIVE);
+                                       break;
+                               }
+                       }
+               }
+#ifdef INVARIANTS
+               else {
+                       KASSERT(txr->hn_agg_txd != NULL,
+                           ("no aggregating txdesc"));
+                       KASSERT(m_head == NULL,
+                           ("pending mbuf for aggregating txdesc"));
                }
+#endif
        }
-       return 0;
+
+       /* Flush pending aggerated transmission. */
+       if (txr->hn_agg_txd != NULL)
+               hn_flush_txagg(ifp, txr);
+       return (sched);
 }
 
 static void
@@ -3473,18 +3905,20 @@ hn_xmit(struct hn_tx_ring *txr, int len)
        struct hn_softc *sc = txr->hn_sc;
        struct ifnet *ifp = sc->hn_ifp;
        struct mbuf *m_head;
+       int sched = 0;
 
        mtx_assert(&txr->hn_tx_lock, MA_OWNED);
 #ifdef HN_IFSTART_SUPPORT
        KASSERT(hn_use_if_start == 0,
            ("hn_xmit is called, when if_start is enabled"));
 #endif
+       KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
 
        if (__predict_false(txr->hn_suspended))
-               return 0;
+               return (0);
 
        if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
-               return 0;
+               return (0);
 
        while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
                struct hn_txdesc *txd;
@@ -3497,7 +3931,8 @@ hn_xmit(struct hn_tx_ring *txr, int len)
                         * following up packets) to tx taskqueue.
                         */
                        drbr_putback(ifp, txr->hn_mbuf_br, m_head);
-                       return 1;
+                       sched = 1;
+                       break;
                }
 
                txd = hn_txdesc_get(txr);
@@ -3508,25 +3943,53 @@ hn_xmit(struct hn_tx_ring *txr, int len)
                        break;
                }
 
-               error = hn_encap(txr, txd, &m_head);
+               error = hn_encap(ifp, txr, txd, &m_head);
                if (error) {
                        /* Both txd and m_head are freed; discard */
+                       KASSERT(txr->hn_agg_txd == NULL,
+                           ("encap failed w/ pending aggregating txdesc"));
                        drbr_advance(ifp, txr->hn_mbuf_br);
                        continue;
                }
 
-               error = hn_txpkt(ifp, txr, txd);
-               if (__predict_false(error)) {
-                       /* txd is freed, but m_head is not */
-                       drbr_putback(ifp, txr->hn_mbuf_br, m_head);
-                       txr->hn_oactive = 1;
-                       break;
+               if (txr->hn_agg_pktleft == 0) {
+                       if (txr->hn_agg_txd != NULL) {
+                               KASSERT(m_head == NULL,
+                                   ("pending mbuf for aggregating txdesc"));
+                               error = hn_flush_txagg(ifp, txr);
+                               if (__predict_false(error)) {
+                                       txr->hn_oactive = 1;
+                                       break;
+                               }
+                       } else {
+                               KASSERT(m_head != NULL, ("mbuf was freed"));
+                               error = hn_txpkt(ifp, txr, txd);
+                               if (__predict_false(error)) {
+                                       /* txd is freed, but m_head is not */
+                                       drbr_putback(ifp, txr->hn_mbuf_br,
+                                           m_head);
+                                       txr->hn_oactive = 1;
+                                       break;
+                               }
+                       }
                }
+#ifdef INVARIANTS
+               else {
+                       KASSERT(txr->hn_agg_txd != NULL,
+                           ("no aggregating txdesc"));
+                       KASSERT(m_head == NULL,
+                           ("pending mbuf for aggregating txdesc"));
+               }
+#endif
 
                /* Sent */
                drbr_advance(ifp, txr->hn_mbuf_br);
        }
-       return 0;
+
+       /* Flush pending aggerated transmission. */
+       if (txr->hn_agg_txd != NULL)
+               hn_flush_txagg(ifp, txr);
+       return (sched);
 }
 
 static int
@@ -4004,6 +4467,11 @@ back:
        if (error)
                return (error);
 
+       /*
+        * Fixup transmission aggregation setup.
+        */
+       hn_set_txagg(sc);
+
        sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
        return (0);
 }

Modified: stable/10/sys/dev/hyperv/netvsc/if_hnvar.h
==============================================================================
--- stable/10/sys/dev/hyperv/netvsc/if_hnvar.h  Thu Dec 29 05:32:34 2016        
(r310735)
+++ stable/10/sys/dev/hyperv/netvsc/if_hnvar.h  Thu Dec 29 06:06:05 2016        
(r310736)
@@ -125,6 +125,22 @@ struct hn_tx_ring {
        bus_dma_tag_t   hn_tx_data_dtag;
        uint64_t        hn_csum_assist;
 
+       /* Applied packet transmission aggregation limits. */
+       int             hn_agg_szmax;
+       short           hn_agg_pktmax;
+       short           hn_agg_align;
+
+       /* Packet transmission aggregation states. */
+       struct hn_txdesc *hn_agg_txd;
+       int             hn_agg_szleft;
+       short           hn_agg_pktleft;
+       struct rndis_packet_msg *hn_agg_prevpkt;
+
+       /* Temporary stats for each sends. */
+       int             hn_stat_size;
+       short           hn_stat_pkts;
+       short           hn_stat_mcasts;
+
        int             (*hn_sendpkt)(struct hn_tx_ring *, struct hn_txdesc *);
        int             hn_suspended;
        int             hn_gpa_cnt;
@@ -137,6 +153,8 @@ struct hn_tx_ring {
        u_long          hn_tx_chimney_tried;
        u_long          hn_tx_chimney;
        u_long          hn_pkts;
+       u_long          hn_sends;
+       u_long          hn_flush_failed;
 
        /* Rarely used stuffs */
        struct hn_txdesc *hn_txdesc;
@@ -181,6 +199,10 @@ struct hn_softc {
        uint32_t        hn_nvs_ver;
        uint32_t        hn_rx_filter;
 
+       /* Packet transmission aggregation user settings. */
+       int                     hn_agg_size;
+       int                     hn_agg_pkts;
+
        struct taskqueue        *hn_mgmt_taskq;
        struct taskqueue        *hn_mgmt_taskq0;
        struct task             hn_link_task;
@@ -201,6 +223,9 @@ struct hn_softc {
        uint32_t                hn_ndis_ver;
        int                     hn_ndis_tso_szmax;
        int                     hn_ndis_tso_sgmin;
+       uint32_t                hn_rndis_agg_size;
+       uint32_t                hn_rndis_agg_pkts;
+       uint32_t                hn_rndis_agg_align;
 
        int                     hn_rss_ind_size;
        uint32_t                hn_rss_hash;    /* NDIS_HASH_ */
_______________________________________________
svn-src-stable-10@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-stable-10
To unsubscribe, send any mail to "svn-src-stable-10-unsubscr...@freebsd.org"

Reply via email to