Author: luigi
Date: Wed Aug 20 23:34:36 2014
New Revision: 270252
URL: http://svnweb.freebsd.org/changeset/base/270252

Log:
  MFC 270063: update of netmap code
  (vtnet and cxgbe not merged yet because we need some other mfc first)

Added:
  stable/10/sys/dev/netmap/if_vtnet_netmap.h   (contents, props changed)
  stable/10/sys/dev/netmap/netmap_monitor.c   (contents, props changed)
Modified:
  stable/10/sys/conf/files
  stable/10/sys/dev/e1000/if_em.c
  stable/10/sys/dev/e1000/if_igb.c
  stable/10/sys/dev/e1000/if_lem.c
  stable/10/sys/dev/ixgbe/ixgbe.c
  stable/10/sys/dev/netmap/if_em_netmap.h
  stable/10/sys/dev/netmap/if_igb_netmap.h
  stable/10/sys/dev/netmap/if_lem_netmap.h
  stable/10/sys/dev/netmap/if_re_netmap.h
  stable/10/sys/dev/netmap/ixgbe_netmap.h
  stable/10/sys/dev/netmap/netmap.c
  stable/10/sys/dev/netmap/netmap_freebsd.c
  stable/10/sys/dev/netmap/netmap_generic.c
  stable/10/sys/dev/netmap/netmap_kern.h
  stable/10/sys/dev/netmap/netmap_mbq.h
  stable/10/sys/dev/netmap/netmap_mem2.c
  stable/10/sys/dev/netmap/netmap_mem2.h
  stable/10/sys/dev/netmap/netmap_offloadings.c
  stable/10/sys/dev/netmap/netmap_pipe.c
  stable/10/sys/dev/netmap/netmap_vale.c
  stable/10/tools/tools/netmap/pkt-gen.c
  stable/10/tools/tools/netmap/vale-ctl.c

Modified: stable/10/sys/conf/files
==============================================================================
--- stable/10/sys/conf/files    Wed Aug 20 23:29:34 2014        (r270251)
+++ stable/10/sys/conf/files    Wed Aug 20 23:34:36 2014        (r270252)
@@ -1933,6 +1933,7 @@ dev/netmap/netmap_freebsd.c       optional net
 dev/netmap/netmap_generic.c    optional netmap
 dev/netmap/netmap_mbq.c                optional netmap
 dev/netmap/netmap_mem2.c       optional netmap
+dev/netmap/netmap_monitor.c    optional netmap
 dev/netmap/netmap_offloadings.c        optional netmap
 dev/netmap/netmap_pipe.c       optional netmap
 dev/netmap/netmap_vale.c       optional netmap

Modified: stable/10/sys/dev/e1000/if_em.c
==============================================================================
--- stable/10/sys/dev/e1000/if_em.c     Wed Aug 20 23:29:34 2014        (r270251)
+++ stable/10/sys/dev/e1000/if_em.c     Wed Aug 20 23:34:36 2014        (r270252)
@@ -3389,10 +3389,10 @@ em_setup_transmit_ring(struct tx_ring *t
                        uint64_t paddr;
                        void *addr;
 
-                       addr = PNMB(slot + si, &paddr);
+                       addr = PNMB(na, slot + si, &paddr);
                        txr->tx_base[i].buffer_addr = htole64(paddr);
                        /* reload the map for netmap mode */
-                       netmap_load_map(txr->txtag, txbuf->map, addr);
+                       netmap_load_map(na, txr->txtag, txbuf->map, addr);
                }
 #endif /* DEV_NETMAP */
 
@@ -4131,8 +4131,8 @@ em_setup_receive_ring(struct rx_ring *rx
                        uint64_t paddr;
                        void *addr;
 
-                       addr = PNMB(slot + si, &paddr);
-                       netmap_load_map(rxr->rxtag, rxbuf->map, addr);
+                       addr = PNMB(na, slot + si, &paddr);
+                       netmap_load_map(na, rxr->rxtag, rxbuf->map, addr);
                        /* Update descriptor */
                        rxr->rx_base[j].buffer_addr = htole64(paddr);
                        continue;

Modified: stable/10/sys/dev/e1000/if_igb.c
==============================================================================
--- stable/10/sys/dev/e1000/if_igb.c    Wed Aug 20 23:29:34 2014        (r270251)
+++ stable/10/sys/dev/e1000/if_igb.c    Wed Aug 20 23:34:36 2014        (r270252)
@@ -3531,7 +3531,7 @@ igb_setup_transmit_ring(struct tx_ring *
                if (slot) {
                        int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
                        /* no need to set the address */
-                       netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si));
+                       netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si));
                }
 #endif /* DEV_NETMAP */
                /* clear the watch index */
@@ -4335,8 +4335,8 @@ igb_setup_receive_ring(struct rx_ring *r
                        uint64_t paddr;
                        void *addr;
 
-                       addr = PNMB(slot + sj, &paddr);
-                       netmap_load_map(rxr->ptag, rxbuf->pmap, addr);
+                       addr = PNMB(na, slot + sj, &paddr);
+                       netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr);
                        /* Update descriptor */
                        rxr->rx_base[j].read.pkt_addr = htole64(paddr);
                        continue;

Modified: stable/10/sys/dev/e1000/if_lem.c
==============================================================================
--- stable/10/sys/dev/e1000/if_lem.c    Wed Aug 20 23:29:34 2014        (r270251)
+++ stable/10/sys/dev/e1000/if_lem.c    Wed Aug 20 23:34:36 2014        (r270252)
@@ -32,6 +32,15 @@
 ******************************************************************************/
 /*$FreeBSD$*/
 
+/*
+ * Uncomment the following extensions for better performance in a VM,
+ * especially if you have support in the hypervisor.
+ * See http://info.iet.unipi.it/~luigi/netmap/
+ */
+// #define BATCH_DISPATCH
+// #define NIC_SEND_COMBINING
+// #define NIC_PARAVIRT        /* enable virtio-like synchronization */
+
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
@@ -289,6 +298,10 @@ static int lem_tx_int_delay_dflt = EM_TI
 static int lem_rx_int_delay_dflt = EM_TICKS_TO_USECS(EM_RDTR);
 static int lem_tx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_TADV);
 static int lem_rx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_RADV);
+/*
+ * increase lem_rxd and lem_txd to at least 2048 in netmap mode
+ * for better performance.
+ */
 static int lem_rxd = EM_DEFAULT_RXD;
 static int lem_txd = EM_DEFAULT_TXD;
 static int lem_smart_pwr_down = FALSE;
@@ -458,6 +471,20 @@ lem_attach(device_t dev)
            "max number of rx packets to process", &adapter->rx_process_limit,
            lem_rx_process_limit);
 
+#ifdef NIC_SEND_COMBINING
+       /* Sysctls to control mitigation */
+       lem_add_rx_process_limit(adapter, "sc_enable",
+           "driver TDT mitigation", &adapter->sc_enable, 0);
+#endif /* NIC_SEND_COMBINING */
+#ifdef BATCH_DISPATCH
+       lem_add_rx_process_limit(adapter, "batch_enable",
+           "driver rx batch", &adapter->batch_enable, 0);
+#endif /* BATCH_DISPATCH */
+#ifdef NIC_PARAVIRT
+       lem_add_rx_process_limit(adapter, "rx_retries",
+           "driver rx retries", &adapter->rx_retries, 0);
+#endif /* NIC_PARAVIRT */
+
         /* Sysctl for setting the interface flow control */
        lem_set_flow_cntrl(adapter, "flow_control",
            "flow control setting",
@@ -515,6 +542,49 @@ lem_attach(device_t dev)
         */
        adapter->hw.mac.report_tx_early = 1;
 
+#ifdef NIC_PARAVIRT
+       device_printf(dev, "driver supports paravirt, subdev 0x%x\n",
+               adapter->hw.subsystem_device_id);
+       if (adapter->hw.subsystem_device_id == E1000_PARA_SUBDEV) {
+               uint64_t bus_addr;
+
+               device_printf(dev, "paravirt support on dev %p\n", adapter);
+               tsize = 4096; // XXX one page for the csb
+               if (lem_dma_malloc(adapter, tsize, &adapter->csb_mem, BUS_DMA_NOWAIT)) {
+                       device_printf(dev, "Unable to allocate csb memory\n");
+                       error = ENOMEM;
+                       goto err_csb;
+               }
+               /* Setup the Base of the CSB */
+               adapter->csb = (struct paravirt_csb *)adapter->csb_mem.dma_vaddr;
+               /* force the first kick */
+               adapter->csb->host_need_txkick = 1; /* txring empty */
+               adapter->csb->guest_need_rxkick = 1; /* no rx packets */
+               bus_addr = adapter->csb_mem.dma_paddr;
+               lem_add_rx_process_limit(adapter, "csb_on",
+                   "enable paravirt.", &adapter->csb->guest_csb_on, 0);
+               lem_add_rx_process_limit(adapter, "txc_lim",
+                   "txc_lim", &adapter->csb->host_txcycles_lim, 1);
+
+               /* some stats */
+#define PA_SC(name, var, val)          \
+       lem_add_rx_process_limit(adapter, name, name, var, val)
+               PA_SC("host_need_txkick",&adapter->csb->host_need_txkick, 1);
+               PA_SC("host_rxkick_at",&adapter->csb->host_rxkick_at, ~0);
+               PA_SC("guest_need_txkick",&adapter->csb->guest_need_txkick, 0);
+               PA_SC("guest_need_rxkick",&adapter->csb->guest_need_rxkick, 1);
+               PA_SC("tdt_reg_count",&adapter->tdt_reg_count, 0);
+               PA_SC("tdt_csb_count",&adapter->tdt_csb_count, 0);
+               PA_SC("tdt_int_count",&adapter->tdt_int_count, 0);
+               PA_SC("guest_need_kick_count",&adapter->guest_need_kick_count, 0);
+               /* tell the host where the block is */
+               E1000_WRITE_REG(&adapter->hw, E1000_CSBAH,
+                       (u32)(bus_addr >> 32));
+               E1000_WRITE_REG(&adapter->hw, E1000_CSBAL,
+                       (u32)bus_addr);
+       }
+#endif /* NIC_PARAVIRT */
+
        tsize = roundup2(adapter->num_tx_desc * sizeof(struct e1000_tx_desc),
            EM_DBA_ALIGN);
 
@@ -673,6 +743,11 @@ err_hw_init:
 err_rx_desc:
        lem_dma_free(adapter, &adapter->txdma);
 err_tx_desc:
+#ifdef NIC_PARAVIRT
+       lem_dma_free(adapter, &adapter->csb_mem);
+err_csb:
+#endif /* NIC_PARAVIRT */
+
 err_pci:
        if (adapter->ifp != NULL)
                if_free(adapter->ifp);
@@ -760,6 +835,12 @@ lem_detach(device_t dev)
                adapter->rx_desc_base = NULL;
        }
 
+#ifdef NIC_PARAVIRT
+       if (adapter->csb) {
+               lem_dma_free(adapter, &adapter->csb_mem);
+               adapter->csb = NULL;
+       }
+#endif /* NIC_PARAVIRT */
        lem_release_hw_control(adapter);
        free(adapter->mta, M_DEVBUF);
        EM_TX_LOCK_DESTROY(adapter);
@@ -869,6 +950,16 @@ lem_start_locked(struct ifnet *ifp)
        }
        if (adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD)
                ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+#ifdef NIC_PARAVIRT
+       if (if_getdrvflags(ifp) & IFF_DRV_OACTIVE && adapter->csb &&
+           adapter->csb->guest_csb_on &&
+           !(adapter->csb->guest_need_txkick & 1))  {
+               adapter->csb->guest_need_txkick = 1;
+               adapter->guest_need_kick_count++;
+               // XXX memory barrier
+               lem_txeof(adapter); // XXX possibly clear IFF_DRV_OACTIVE
+       }
+#endif /* NIC_PARAVIRT */
 
        return;
 }
@@ -1715,6 +1806,37 @@ lem_xmit(struct adapter *adapter, struct
         */
        bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
            BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+
+#ifdef NIC_PARAVIRT
+       if (adapter->csb) {
+               adapter->csb->guest_tdt = i;
+               /* XXX memory barrier ? */
+               if (adapter->csb->guest_csb_on &&
+                   !(adapter->csb->host_need_txkick & 1)) {
+                       /* XXX maybe useless
+                        * clean the ring. maybe do it before ?
+                        * maybe a little bit of histeresys ?
+                        */
+                       if (adapter->num_tx_desc_avail <= 64) {// XXX
+                               lem_txeof(adapter);
+                       }
+                       return (0);
+               }
+       }
+#endif /* NIC_PARAVIRT */
+
+#ifdef NIC_SEND_COMBINING
+       if (adapter->sc_enable) {
+               if (adapter->shadow_tdt & MIT_PENDING_INT) {
+                       /* signal intr and data pending */
+                       adapter->shadow_tdt = MIT_PENDING_TDT | (i & 0xffff);
+                       return (0);
+               } else {
+                       adapter->shadow_tdt = MIT_PENDING_INT;
+               }
+       }
+#endif /* NIC_SEND_COMBINING */
+
        if (adapter->hw.mac.type == e1000_82547 &&
            adapter->link_duplex == HALF_DUPLEX)
                lem_82547_move_tail(adapter);
@@ -1995,6 +2117,20 @@ lem_local_timer(void *arg)
 
        lem_smartspeed(adapter);
 
+#ifdef NIC_PARAVIRT
+       /* recover space if needed */
+       if (adapter->csb && adapter->csb->guest_csb_on &&
+           (adapter->watchdog_check == TRUE) &&
+           (ticks - adapter->watchdog_time > EM_WATCHDOG) &&
+           (adapter->num_tx_desc_avail != adapter->num_tx_desc) ) {
+               lem_txeof(adapter);
+               /*
+                * lem_txeof() normally (except when space in the queue
+                * runs low XXX) cleans watchdog_check so that
+                * we do not hung.
+                */
+       }
+#endif /* NIC_PARAVIRT */
        /*
         * We check the watchdog: the time since
         * the last TX descriptor was cleaned.
@@ -2677,10 +2813,10 @@ lem_setup_transmit_structures(struct ada
                        uint64_t paddr;
                        void *addr;
 
-                       addr = PNMB(slot + si, &paddr);
+                       addr = PNMB(na, slot + si, &paddr);
                        adapter->tx_desc_base[i].buffer_addr = htole64(paddr);
                        /* reload the map for netmap mode */
-                       netmap_load_map(adapter->txtag, tx_buffer->map, addr);
+                       netmap_load_map(na, adapter->txtag, tx_buffer->map, addr);
                }
 #endif /* DEV_NETMAP */
                tx_buffer->next_eop = -1;
@@ -3055,6 +3191,16 @@ lem_txeof(struct adapter *adapter)
         adapter->next_tx_to_clean = first;
         adapter->num_tx_desc_avail = num_avail;
 
+#ifdef NIC_SEND_COMBINING
+       if ((adapter->shadow_tdt & MIT_PENDING_TDT) == MIT_PENDING_TDT) {
+               /* a tdt write is pending, do it */
+               E1000_WRITE_REG(&adapter->hw, E1000_TDT(0),
+                       0xffff & adapter->shadow_tdt);
+               adapter->shadow_tdt = MIT_PENDING_INT;
+       } else {
+               adapter->shadow_tdt = 0; // disable
+       }
+#endif /* NIC_SEND_COMBINING */
         /*
          * If we have enough room, clear IFF_DRV_OACTIVE to
          * tell the stack that it is OK to send packets.
@@ -3062,6 +3208,12 @@ lem_txeof(struct adapter *adapter)
          */
         if (adapter->num_tx_desc_avail > EM_TX_CLEANUP_THRESHOLD) {            
    
                 ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+#ifdef NIC_PARAVIRT
+               if (adapter->csb) { // XXX also csb_on ?
+                       adapter->csb->guest_need_txkick = 2; /* acked */
+                       // XXX memory barrier
+               }
+#endif /* NIC_PARAVIRT */
                 if (adapter->num_tx_desc_avail == adapter->num_tx_desc) {
                        adapter->watchdog_check = FALSE;
                        return;
@@ -3247,8 +3399,8 @@ lem_setup_receive_structures(struct adap
                        uint64_t paddr;
                        void *addr;
 
-                       addr = PNMB(slot + si, &paddr);
-                       netmap_load_map(adapter->rxtag, rx_buffer->map, addr);
+                       addr = PNMB(na, slot + si, &paddr);
+                       netmap_load_map(na, adapter->rxtag, rx_buffer->map, addr);
                        /* Update descriptor */
                        adapter->rx_desc_base[i].buffer_addr = htole64(paddr);
                        continue;
@@ -3445,7 +3597,23 @@ lem_rxeof(struct adapter *adapter, int c
        int             i, rx_sent = 0;
        struct e1000_rx_desc   *current_desc;
 
+#ifdef BATCH_DISPATCH
+       struct mbuf *mh = NULL, *mt = NULL;
+#endif /* BATCH_DISPATCH */
+#ifdef NIC_PARAVIRT
+       int retries = 0;
+       struct paravirt_csb* csb = adapter->csb;
+       int csb_mode = csb && csb->guest_csb_on;
+
+       //ND("clear guest_rxkick at %d", adapter->next_rx_desc_to_check);
+       if (csb_mode && csb->guest_need_rxkick)
+               csb->guest_need_rxkick = 0;
+#endif /* NIC_PARAVIRT */
        EM_RX_LOCK(adapter);
+
+#ifdef BATCH_DISPATCH
+    batch_again:
+#endif /* BATCH_DISPATCH */
        i = adapter->next_rx_desc_to_check;
        current_desc = &adapter->rx_desc_base[i];
        bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
@@ -3458,19 +3626,45 @@ lem_rxeof(struct adapter *adapter, int c
        }
 #endif /* DEV_NETMAP */
 
+#if 1 // XXX optimization ?
        if (!((current_desc->status) & E1000_RXD_STAT_DD)) {
                if (done != NULL)
                        *done = rx_sent;
                EM_RX_UNLOCK(adapter);
                return (FALSE);
        }
+#endif /* 0 */
 
        while (count != 0 && ifp->if_drv_flags & IFF_DRV_RUNNING) {
                struct mbuf *m = NULL;
 
                status = current_desc->status;
-               if ((status & E1000_RXD_STAT_DD) == 0)
+               if ((status & E1000_RXD_STAT_DD) == 0) {
+#ifdef NIC_PARAVIRT
+                   if (csb_mode) {
+                       /* buffer not ready yet. Retry a few times before giving up */
+                       if (++retries <= adapter->rx_retries) {
+                               continue;
+                       }
+                       if (csb->guest_need_rxkick == 0) {
+                               // ND("set guest_rxkick at %d", adapter->next_rx_desc_to_check);
+                               csb->guest_need_rxkick = 1;
+                               // XXX memory barrier, status volatile ?
+                               continue; /* double check */
+                       }
+                   }
+                   /* no buffer ready, give up */
+#endif /* NIC_PARAVIRT */
                        break;
+               }
+#ifdef NIC_PARAVIRT
+               if (csb_mode) {
+                       if (csb->guest_need_rxkick)
+                               // ND("clear again guest_rxkick at %d", adapter->next_rx_desc_to_check);
+                       csb->guest_need_rxkick = 0;
+                       retries = 0;
+               }
+#endif /* NIC_PARAVIRT */
 
                mp = adapter->rx_buffer_area[i].m_head;
                /*
@@ -3595,11 +3789,36 @@ discard:
                bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
                    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
+#ifdef NIC_PARAVIRT
+               if (csb_mode) {
+                       /* the buffer at i has been already replaced by lem_get_buf()
+                        * so it is safe to set guest_rdt = i and possibly send a kick.
+                        * XXX see if we can optimize it later.
+                        */
+                       csb->guest_rdt = i;
+                       // XXX memory barrier
+                       if (i == csb->host_rxkick_at)
+                               E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i);
+               }
+#endif /* NIC_PARAVIRT */
                /* Advance our pointers to the next descriptor. */
                if (++i == adapter->num_rx_desc)
                        i = 0;
                /* Call into the stack */
                if (m != NULL) {
+#ifdef BATCH_DISPATCH
+                   if (adapter->batch_enable) {
+                       if (mh == NULL)
+                               mh = mt = m;
+                       else
+                               mt->m_nextpkt = m;
+                       mt = m;
+                       m->m_nextpkt = NULL;
+                       rx_sent++;
+                       current_desc = &adapter->rx_desc_base[i];
+                       continue;
+                   }
+#endif /* BATCH_DISPATCH */
                        adapter->next_rx_desc_to_check = i;
                        EM_RX_UNLOCK(adapter);
                        (*ifp->if_input)(ifp, m);
@@ -3610,10 +3829,27 @@ discard:
                current_desc = &adapter->rx_desc_base[i];
        }
        adapter->next_rx_desc_to_check = i;
+#ifdef BATCH_DISPATCH
+       if (mh) {
+               EM_RX_UNLOCK(adapter);
+               while ( (mt = mh) != NULL) {
+                       mh = mh->m_nextpkt;
+                       mt->m_nextpkt = NULL;
+                       if_input(ifp, mt);
+               }
+               EM_RX_LOCK(adapter);
+               i = adapter->next_rx_desc_to_check; /* in case of interrupts */
+               if (count > 0)
+                       goto batch_again;
+       }
+#endif /* BATCH_DISPATCH */
 
        /* Advance the E1000's Receive Queue #0  "Tail Pointer". */
        if (--i < 0)
                i = adapter->num_rx_desc - 1;
+#ifdef NIC_PARAVIRT
+       if (!csb_mode) /* filter out writes */
+#endif /* NIC_PARAVIRT */
        E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i);
        if (done != NULL)
                *done = rx_sent;

Modified: stable/10/sys/dev/ixgbe/ixgbe.c
==============================================================================
--- stable/10/sys/dev/ixgbe/ixgbe.c     Wed Aug 20 23:29:34 2014        (r270251)
+++ stable/10/sys/dev/ixgbe/ixgbe.c     Wed Aug 20 23:34:36 2014        (r270252)
@@ -3079,7 +3079,7 @@ ixgbe_setup_transmit_ring(struct tx_ring
                 */
                if (slot) {
                        int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
-                       netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si));
+                       netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si));
                }
 #endif /* DEV_NETMAP */
                /* Clear the EOP descriptor pointer */
@@ -4025,8 +4025,8 @@ ixgbe_setup_receive_ring(struct rx_ring 
                        uint64_t paddr;
                        void *addr;
 
-                       addr = PNMB(slot + sj, &paddr);
-                       netmap_load_map(rxr->ptag, rxbuf->pmap, addr);
+                       addr = PNMB(na, slot + sj, &paddr);
+                       netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr);
                        /* Update descriptor and the cached value */
                        rxr->rx_base[j].read.pkt_addr = htole64(paddr);
                        rxbuf->addr = htole64(paddr);

Modified: stable/10/sys/dev/netmap/if_em_netmap.h
==============================================================================
--- stable/10/sys/dev/netmap/if_em_netmap.h     Wed Aug 20 23:29:34 2014        (r270251)
+++ stable/10/sys/dev/netmap/if_em_netmap.h     Wed Aug 20 23:34:36 2014        (r270252)
@@ -113,10 +113,10 @@ em_netmap_reg(struct netmap_adapter *na,
  * Reconcile kernel and user view of the transmit ring.
  */
 static int
-em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+em_netmap_txsync(struct netmap_kring *kring, int flags)
 {
+       struct netmap_adapter *na = kring->na;
        struct ifnet *ifp = na->ifp;
-       struct netmap_kring *kring = &na->tx_rings[ring_nr];
        struct netmap_ring *ring = kring->ring;
        u_int nm_i;     /* index into the netmap ring */
        u_int nic_i;    /* index into the NIC ring */
@@ -128,7 +128,7 @@ em_netmap_txsync(struct netmap_adapter *
 
        /* device-specific */
        struct adapter *adapter = ifp->if_softc;
-       struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+       struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
 
        bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
                        BUS_DMASYNC_POSTREAD);
@@ -144,7 +144,7 @@ em_netmap_txsync(struct netmap_adapter *
                        struct netmap_slot *slot = &ring->slot[nm_i];
                        u_int len = slot->len;
                        uint64_t paddr;
-                       void *addr = PNMB(slot, &paddr);
+                       void *addr = PNMB(na, slot, &paddr);
 
                        /* device-specific */
                        struct e1000_tx_desc *curr = &txr->tx_base[nic_i];
@@ -153,12 +153,12 @@ em_netmap_txsync(struct netmap_adapter *
                                nic_i == 0 || nic_i == report_frequency) ?
                                E1000_TXD_CMD_RS : 0;
 
-                       NM_CHECK_ADDR_LEN(addr, len);
+                       NM_CHECK_ADDR_LEN(na, addr, len);
 
                        if (slot->flags & NS_BUF_CHANGED) {
                                curr->buffer_addr = htole64(paddr);
                                /* buffer has changed, reload map */
-                               netmap_reload_map(txr->txtag, txbuf->map, addr);
+                               netmap_reload_map(na, txr->txtag, txbuf->map, addr);
                        }
                        slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
 
@@ -187,7 +187,7 @@ em_netmap_txsync(struct netmap_adapter *
         */
        if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
                /* record completed transmissions using TDH */
-               nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+               nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id));
                if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
                        D("TDH wrap %d", nic_i);
                        nic_i -= kring->nkr_num_slots;
@@ -208,10 +208,10 @@ em_netmap_txsync(struct netmap_adapter *
  * Reconcile kernel and user view of the receive ring.
  */
 static int
-em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+em_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
+       struct netmap_adapter *na = kring->na;
        struct ifnet *ifp = na->ifp;
-       struct netmap_kring *kring = &na->rx_rings[ring_nr];
        struct netmap_ring *ring = kring->ring;
        u_int nm_i;     /* index into the netmap ring */
        u_int nic_i;    /* index into the NIC ring */
@@ -222,7 +222,7 @@ em_netmap_rxsync(struct netmap_adapter *
 
        /* device-specific */
        struct adapter *adapter = ifp->if_softc;
-       struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+       struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];
 
        if (head > lim)
                return netmap_ring_reinit(kring);
@@ -271,18 +271,18 @@ em_netmap_rxsync(struct netmap_adapter *
                for (n = 0; nm_i != head; n++) {
                        struct netmap_slot *slot = &ring->slot[nm_i];
                        uint64_t paddr;
-                       void *addr = PNMB(slot, &paddr);
+                       void *addr = PNMB(na, slot, &paddr);
 
                        struct e1000_rx_desc *curr = &rxr->rx_base[nic_i];
                        struct em_buffer *rxbuf = &rxr->rx_buffers[nic_i];
 
-                       if (addr == netmap_buffer_base) /* bad buf */
+                       if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
                                goto ring_reset;
 
                        if (slot->flags & NS_BUF_CHANGED) {
                                /* buffer has changed, reload map */
                                curr->buffer_addr = htole64(paddr);
-                               netmap_reload_map(rxr->rxtag, rxbuf->map, addr);
+                               netmap_reload_map(na, rxr->rxtag, rxbuf->map, addr);
                                slot->flags &= ~NS_BUF_CHANGED;
                        }
                        curr->status = 0;

Modified: stable/10/sys/dev/netmap/if_igb_netmap.h
==============================================================================
--- stable/10/sys/dev/netmap/if_igb_netmap.h    Wed Aug 20 23:29:34 2014        (r270251)
+++ stable/10/sys/dev/netmap/if_igb_netmap.h    Wed Aug 20 23:34:36 2014        (r270252)
@@ -81,10 +81,10 @@ igb_netmap_reg(struct netmap_adapter *na
  * Reconcile kernel and user view of the transmit ring.
  */
 static int
-igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+igb_netmap_txsync(struct netmap_kring *kring, int flags)
 {
+       struct netmap_adapter *na = kring->na;
        struct ifnet *ifp = na->ifp;
-       struct netmap_kring *kring = &na->tx_rings[ring_nr];
        struct netmap_ring *ring = kring->ring;
        u_int nm_i;     /* index into the netmap ring */
        u_int nic_i;    /* index into the NIC ring */
@@ -96,7 +96,7 @@ igb_netmap_txsync(struct netmap_adapter 
 
        /* device-specific */
        struct adapter *adapter = ifp->if_softc;
-       struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+       struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
        /* 82575 needs the queue index added */
        u32 olinfo_status =
            (adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0;
@@ -115,7 +115,7 @@ igb_netmap_txsync(struct netmap_adapter 
                        struct netmap_slot *slot = &ring->slot[nm_i];
                        u_int len = slot->len;
                        uint64_t paddr;
-                       void *addr = PNMB(slot, &paddr);
+                       void *addr = PNMB(na, slot, &paddr);
 
                        /* device-specific */
                        union e1000_adv_tx_desc *curr =
@@ -125,11 +125,11 @@ igb_netmap_txsync(struct netmap_adapter 
                                nic_i == 0 || nic_i == report_frequency) ?
                                E1000_ADVTXD_DCMD_RS : 0;
 
-                       NM_CHECK_ADDR_LEN(addr, len);
+                       NM_CHECK_ADDR_LEN(na, addr, len);
 
                        if (slot->flags & NS_BUF_CHANGED) {
                                /* buffer has changed, reload map */
-                               netmap_reload_map(txr->txtag, txbuf->map, addr);
+                               netmap_reload_map(na, txr->txtag, txbuf->map, addr);
                        }
                        slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
 
@@ -171,7 +171,7 @@ igb_netmap_txsync(struct netmap_adapter 
         */
        if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
                /* record completed transmissions using TDH */
-               nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+               nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id));
                if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
                        D("TDH wrap %d", nic_i);
                        nic_i -= kring->nkr_num_slots;
@@ -190,10 +190,10 @@ igb_netmap_txsync(struct netmap_adapter 
  * Reconcile kernel and user view of the receive ring.
  */
 static int
-igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+igb_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
+       struct netmap_adapter *na = kring->na;
        struct ifnet *ifp = na->ifp;
-       struct netmap_kring *kring = &na->rx_rings[ring_nr];
        struct netmap_ring *ring = kring->ring;
        u_int nm_i;     /* index into the netmap ring */
        u_int nic_i;    /* index into the NIC ring */
@@ -204,7 +204,7 @@ igb_netmap_rxsync(struct netmap_adapter 
 
        /* device-specific */
        struct adapter *adapter = ifp->if_softc;
-       struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+       struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];
 
        if (head > lim)
                return netmap_ring_reinit(kring);
@@ -251,17 +251,17 @@ igb_netmap_rxsync(struct netmap_adapter 
                for (n = 0; nm_i != head; n++) {
                        struct netmap_slot *slot = &ring->slot[nm_i];
                        uint64_t paddr;
-                       void *addr = PNMB(slot, &paddr);
+                       void *addr = PNMB(na, slot, &paddr);
 
                        union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i];
                        struct igb_rx_buf *rxbuf = &rxr->rx_buffers[nic_i];
 
-                       if (addr == netmap_buffer_base) /* bad buf */
+                       if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
                                goto ring_reset;
 
                        if (slot->flags & NS_BUF_CHANGED) {
                                /* buffer has changed, reload map */
-                               netmap_reload_map(rxr->ptag, rxbuf->pmap, addr);
+                               netmap_reload_map(na, rxr->ptag, rxbuf->pmap, addr);
                                slot->flags &= ~NS_BUF_CHANGED;
                        }
                        curr->wb.upper.status_error = 0;

Modified: stable/10/sys/dev/netmap/if_lem_netmap.h
==============================================================================
--- stable/10/sys/dev/netmap/if_lem_netmap.h    Wed Aug 20 23:29:34 2014        (r270251)
+++ stable/10/sys/dev/netmap/if_lem_netmap.h    Wed Aug 20 23:34:36 2014        (r270252)
@@ -39,6 +39,7 @@
 #include <vm/pmap.h>    /* vtophys ? */
 #include <dev/netmap/netmap_kern.h>
 
+extern int netmap_adaptive_io;
 
 /*
  * Register/unregister. We are already under netmap lock.
@@ -84,10 +85,10 @@ lem_netmap_reg(struct netmap_adapter *na
  * Reconcile kernel and user view of the transmit ring.
  */
 static int
-lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+lem_netmap_txsync(struct netmap_kring *kring, int flags)
 {
+       struct netmap_adapter *na = kring->na;
        struct ifnet *ifp = na->ifp;
-       struct netmap_kring *kring = &na->tx_rings[ring_nr];
        struct netmap_ring *ring = kring->ring;
        u_int nm_i;     /* index into the netmap ring */
        u_int nic_i;    /* index into the NIC ring */
@@ -98,6 +99,10 @@ lem_netmap_txsync(struct netmap_adapter 
 
        /* device-specific */
        struct adapter *adapter = ifp->if_softc;
+#ifdef NIC_PARAVIRT
+       struct paravirt_csb *csb = adapter->csb;
+       uint64_t *csbd = (uint64_t *)(csb + 1);
+#endif /* NIC_PARAVIRT */
 
        bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
                        BUS_DMASYNC_POSTREAD);
@@ -108,12 +113,25 @@ lem_netmap_txsync(struct netmap_adapter 
 
        nm_i = kring->nr_hwcur;
        if (nm_i != head) {     /* we have new packets to send */
+#ifdef NIC_PARAVIRT
+               int do_kick = 0;
+               uint64_t t = 0; // timestamp
+               int n = head - nm_i;
+               if (n < 0)
+                       n += lim + 1;
+               if (csb) {
+                       t = rdtsc(); /* last timestamp */
+                       csbd[16] += t - csbd[0]; /* total Wg */
+                       csbd[17] += n;          /* Wg count */
+                       csbd[0] = t;
+               }
+#endif /* NIC_PARAVIRT */
                nic_i = netmap_idx_k2n(kring, nm_i);
                while (nm_i != head) {
                        struct netmap_slot *slot = &ring->slot[nm_i];
                        u_int len = slot->len;
                        uint64_t paddr;
-                       void *addr = PNMB(slot, &paddr);
+                       void *addr = PNMB(na, slot, &paddr);
 
                        /* device-specific */
                        struct e1000_tx_desc *curr = &adapter->tx_desc_base[nic_i];
@@ -122,12 +140,12 @@ lem_netmap_txsync(struct netmap_adapter 
                                nic_i == 0 || nic_i == report_frequency) ?
                                E1000_TXD_CMD_RS : 0;
 
-                       NM_CHECK_ADDR_LEN(addr, len);
+                       NM_CHECK_ADDR_LEN(na, addr, len);
 
                        if (slot->flags & NS_BUF_CHANGED) {
                                /* buffer has changed, reload map */
                                curr->buffer_addr = htole64(paddr);
-                               netmap_reload_map(adapter->txtag, txbuf->map, addr);
+                               netmap_reload_map(na, adapter->txtag, txbuf->map, addr);
                        }
                        slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
 
@@ -140,6 +158,7 @@ lem_netmap_txsync(struct netmap_adapter 
 
                        nm_i = nm_next(nm_i, lim);
                        nic_i = nm_next(nic_i, lim);
+                       // XXX might try an early kick
                }
                kring->nr_hwcur = head;
 
@@ -147,8 +166,38 @@ lem_netmap_txsync(struct netmap_adapter 
                bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
                        BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
+#ifdef NIC_PARAVIRT
+               /* set unconditionally, then also kick if needed */
+               if (csb) {
+                       t = rdtsc();
+                       if (csb->host_need_txkick == 2) {
+                               /* can compute an update of delta */
+                               int64_t delta = t - csbd[3];
+                               if (delta < 0)
+                                       delta = -delta;
+                               if (csbd[8] == 0 || delta < csbd[8]) {
+                                       csbd[8] = delta;
+                                       csbd[9]++;
+                               }
+                               csbd[10]++;
+                       }
+                       csb->guest_tdt = nic_i;
+                       csbd[18] += t - csbd[0]; // total wp
+                       csbd[19] += n;
+               }
+               if (!csb || !csb->guest_csb_on || (csb->host_need_txkick & 1))
+                       do_kick = 1;
+               if (do_kick)
+#endif /* NIC_PARAVIRT */
                /* (re)start the tx unit up to slot nic_i (excluded) */
                E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i);
+#ifdef NIC_PARAVIRT
+               if (do_kick) {
+                       uint64_t t1 = rdtsc();
+                       csbd[20] += t1 - t; // total Np
+                       csbd[21]++;
+               }
+#endif /* NIC_PARAVIRT */
        }
 
        /*
@@ -157,6 +206,93 @@ lem_netmap_txsync(struct netmap_adapter 
        if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
                kring->last_reclaim = ticks;
                /* record completed transmissions using TDH */
+#ifdef NIC_PARAVIRT
+               /* host updates tdh unconditionally, and we have
+                * no side effects on reads, so we can read from there
+                * instead of exiting.
+                */
+               if (csb) {
+                   static int drain = 0, nodrain=0, good = 0, bad = 0, fail = 0;
+                   u_int x = adapter->next_tx_to_clean;
+                   csbd[19]++; // XXX count reclaims
+                   nic_i = csb->host_tdh;
+                   if (csb->guest_csb_on) {
+                       if (nic_i == x) {
+                           bad++;
+                           csbd[24]++; // failed reclaims
+                           /* no progress, request kick and retry */
+                           csb->guest_need_txkick = 1;
+                           mb(); // XXX barrier
+                           nic_i = csb->host_tdh;
+                       } else {
+                           good++;
+                       }
+                       if (nic_i != x) {
+                           csb->guest_need_txkick = 2;
+                           if (nic_i == csb->guest_tdt)
+                               drain++;
+                           else
+                               nodrain++;
+#if 1
+                       if (netmap_adaptive_io) {
+                           /* new mechanism: last half ring (or so)
+                            * released one slot at a time.
+                            * This effectively makes the system spin.
+                            *
+                            * Take next_to_clean + 1 as a reference.
+                            * tdh must be ahead or equal
+                            * On entry, the logical order is
+                            *          x < tdh = nic_i
+                            * We first push tdh up to avoid wraps.
+                            * The limit is tdh-ll (half ring).
+                            * if tdh-256 < x we report x;
+                            * else we report tdh-256
+                            */
+                           u_int tdh = nic_i;
+                           u_int ll = csbd[15];
+                           u_int delta = lim/8;
+                           if (netmap_adaptive_io == 2 || ll > delta)
+                               csbd[15] = ll = delta;
+                           else if (netmap_adaptive_io == 1 && ll > 1) {
+                               csbd[15]--;
+                           }
+
+                           if (nic_i >= kring->nkr_num_slots) {
+                               RD(5, "bad nic_i %d on input", nic_i);
+                           }
+                           x = nm_next(x, lim);
+                           if (tdh < x)
+                               tdh += lim + 1;
+                           if (tdh <= x + ll) {
+                               nic_i = x;
+                               csbd[25]++; //report n + 1;
+                           } else {
+                               tdh = nic_i;
+                               if (tdh < ll)
+                                   tdh += lim + 1;
+                               nic_i = tdh - ll;
+                               csbd[26]++; // report tdh - ll
+                           }
+                       }
+#endif
+                       } else {
+                           /* we stop, count whether we are idle or not */
+                           int bh_active = csb->host_need_txkick & 2 ? 4 : 0;
+                           csbd[27+ csb->host_need_txkick]++;
+                           if (netmap_adaptive_io == 1) {
+                               if (bh_active && csbd[15] > 1)
+                                   csbd[15]--;
+                               else if (!bh_active && csbd[15] < lim/2)
+                                   csbd[15]++;
+                           }
+                           bad--;
+                           fail++;
+                       }
+                   }
+                   RD(1, "drain %d nodrain %d good %d retry %d fail %d",
+                       drain, nodrain, good, bad, fail);
+               } else
+#endif /* !NIC_PARAVIRT */
                nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0));
                if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
                        D("TDH wrap %d", nic_i);
@@ -176,10 +312,10 @@ lem_netmap_txsync(struct netmap_adapter 
  * Reconcile kernel and user view of the receive ring.
  */
 static int
-lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+lem_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
+       struct netmap_adapter *na = kring->na;
        struct ifnet *ifp = na->ifp;
-       struct netmap_kring *kring = &na->rx_rings[ring_nr];
        struct netmap_ring *ring = kring->ring;
        u_int nm_i;     /* index into the netmap ring */
        u_int nic_i;    /* index into the NIC ring */
@@ -190,10 +326,21 @@ lem_netmap_rxsync(struct netmap_adapter 
 
        /* device-specific */
        struct adapter *adapter = ifp->if_softc;
+#ifdef NIC_PARAVIRT
+       struct paravirt_csb *csb = adapter->csb;
+       uint32_t csb_mode = csb && csb->guest_csb_on;
+       uint32_t do_host_rxkick = 0;
+#endif /* NIC_PARAVIRT */
 
        if (head > lim)
                return netmap_ring_reinit(kring);
 
+#ifdef NIC_PARAVIRT
+       if (csb_mode) {
+               force_update = 1;
+               csb->guest_need_rxkick = 0;
+       }
+#endif /* NIC_PARAVIRT */
        /* XXX check sync modes */
        bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
                        BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
@@ -212,11 +359,28 @@ lem_netmap_rxsync(struct netmap_adapter 
                        uint32_t staterr = le32toh(curr->status);
                        int len;
 
+#ifdef NIC_PARAVIRT
+                       if (csb_mode) {
+                           if ((staterr & E1000_RXD_STAT_DD) == 0) {
+                               /* don't bother to retry if more than 1 pkt */
+                               if (n > 1)
+                                   break;
+                               csb->guest_need_rxkick = 1;
+                               wmb();
+                               staterr = le32toh(curr->status);
+                               if ((staterr & E1000_RXD_STAT_DD) == 0) {
+                                   break;
+                               } else { /* we are good */
+                                  csb->guest_need_rxkick = 0;
+                               }
+                           }
+                       } else
+#endif /* NIC_PARAVIRT */
                        if ((staterr & E1000_RXD_STAT_DD) == 0)
                                break;
                        len = le16toh(curr->length) - 4; // CRC
                        if (len < 0) {
-                               D("bogus pkt size %d nic idx %d", len, nic_i);
+                               RD(5, "bogus pkt (%d) size %d nic idx %d", n, len, nic_i);
                                len = 0;
                        }
                        ring->slot[nm_i].len = len;
@@ -228,6 +392,18 @@ lem_netmap_rxsync(struct netmap_adapter 
                        nic_i = nm_next(nic_i, lim);
                }
                if (n) { /* update the state variables */
+#ifdef NIC_PARAVIRT
+                       if (csb_mode) {
+                           if (n > 1) {
+                               /* leave one spare buffer so we avoid rxkicks */
+                               nm_i = nm_prev(nm_i, lim);
+                               nic_i = nm_prev(nic_i, lim);
+                               n--;
+                           } else {
+                               csb->guest_need_rxkick = 1;
+                           }
+                       }
+#endif /* NIC_PARAVIRT */
                        ND("%d new packets at nic %d nm %d tail %d",
                                n,
                                adapter->next_rx_desc_to_check,
@@ -249,23 +425,27 @@ lem_netmap_rxsync(struct netmap_adapter 
                for (n = 0; nm_i != head; n++) {
                        struct netmap_slot *slot = &ring->slot[nm_i];
                        uint64_t paddr;
-                       void *addr = PNMB(slot, &paddr);
+                       void *addr = PNMB(na, slot, &paddr);
 
                        struct e1000_rx_desc *curr = &adapter->rx_desc_base[nic_i];
                        struct em_buffer *rxbuf = &adapter->rx_buffer_area[nic_i];
 
-                       if (addr == netmap_buffer_base) /* bad buf */
+                       if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
                                goto ring_reset;
 
                        if (slot->flags & NS_BUF_CHANGED) {
                                /* buffer has changed, reload map */
                                curr->buffer_addr = htole64(paddr);
-                               netmap_reload_map(adapter->rxtag, rxbuf->map, addr);
+                               netmap_reload_map(na, adapter->rxtag, rxbuf->map, addr);
                                slot->flags &= ~NS_BUF_CHANGED;
                        }
                        curr->status = 0;
                        bus_dmamap_sync(adapter->rxtag, rxbuf->map,
                            BUS_DMASYNC_PREREAD);
+#ifdef NIC_PARAVIRT
+                       if (csb_mode && csb->host_rxkick_at == nic_i)
+                               do_host_rxkick = 1;
+#endif /* NIC_PARAVIRT */

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to