Author: luigi
Date: Thu May 30 14:07:14 2013
New Revision: 251139
URL: http://svnweb.freebsd.org/changeset/base/251139

Log:
  Bring in a number of new features, mostly implemented by Michio Honda:
  
  - the VALE switch now supports up to 254 destinations per switch,
    unicast or broadcast (multicast goes to all ports).
  
  - we can attach hw interfaces and the host stack to a VALE switch,
    which means we will be able to use it more or less as a native bridge
    (minor tweaks still necessary).
    A 'vale-ctl' program is supplied in tools/tools/netmap
    to attach/detach ports to the switch, and to list the current configuration.
  
  - the lookup function in the VALE switch can be reassigned to
    something else, similar to the pf hooks. This will enable
    attaching the firewall, or other processing functions (e.g. in-kernel
    openvswitch) directly on the netmap port.
  
  The internal API used by device drivers does not change.
  
  Userspace applications should be recompiled because we
  bump NETMAP_API as we now use some fields in the struct nmreq
  that were previously ignored -- otherwise, data structures
  are the same.
  
  Manpages will be committed separately.

Added:
  head/tools/tools/netmap/vale-ctl.c   (contents, props changed)
Modified:
  head/sys/dev/netmap/netmap.c
  head/sys/dev/netmap/netmap_kern.h
  head/sys/net/netmap.h
  head/tools/tools/netmap/Makefile

Modified: head/sys/dev/netmap/netmap.c
==============================================================================
--- head/sys/dev/netmap/netmap.c        Thu May 30 13:41:19 2013        
(r251138)
+++ head/sys/dev/netmap/netmap.c        Thu May 30 14:07:14 2013        
(r251139)
@@ -119,6 +119,9 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, mitiga
 int netmap_no_pendintr = 1;
 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received 
packets.");
+int netmap_txsync_retry = 2;
+SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
+    &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
 
 int netmap_drop = 0;   /* debugging */
 int netmap_flags = 0;  /* debug flags */
@@ -128,25 +131,30 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, drop, 
 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
 
-#ifdef NM_BRIDGE /* support for netmap bridge */
+#ifdef NM_BRIDGE /* support for netmap virtual switch, called VALE */
 
 /*
- * system parameters.
+ * system parameters (most of them in netmap_kern.h)
+ * NM_NAME     prefix for switch port names, default "vale"
+ * NM_MAXPORTS number of ports
+ * NM_BRIDGES  max number of switches in the system.
+ *     XXX should become a sysctl or tunable
  *
- * All switched ports have prefix NM_NAME.
- * The switch has a max of NM_BDG_MAXPORTS ports (often stored in a bitmap,
- * so a practical upper bound is 64).
- * Each tx ring is read-write, whereas rx rings are readonly (XXX not done 
yet).
+ * Switch ports are named valeX:Y where X is the switch name and Y
+ * is the port. If Y matches a physical interface name, the port is
+ * connected to a physical device.
+ *
+ * Unlike physical interfaces, switch ports use their own memory region
+ * for rings and buffers.
  * The virtual interfaces use per-queue lock instead of core lock.
  * In the tx loop, we aggregate traffic in batches to make all operations
  * faster. The batch size is NM_BDG_BATCH
  */
-#define        NM_NAME                 "vale"  /* prefix for the interface */
-#define NM_BDG_MAXPORTS                16      /* up to 64 ? */
+#define NM_BDG_MAXRINGS                16      /* XXX unclear how many. */
 #define NM_BRIDGE_RINGSIZE     1024    /* in the device */
 #define NM_BDG_HASH            1024    /* forwarding table entries */
 #define NM_BDG_BATCH           1024    /* entries in the forwarding buffer */
-#define        NM_BRIDGES              4       /* number of bridges */
+#define        NM_BRIDGES              8       /* number of bridges */
 
 
 int netmap_bridge = NM_BDG_BATCH; /* bridge batch size */
@@ -174,14 +182,27 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, bridge
 #define        ADD_BDG_REF(ifp)        
refcount_acquire(&NA(ifp)->na_bdg_refcount)
 #define        DROP_BDG_REF(ifp)       
refcount_release(&NA(ifp)->na_bdg_refcount)
 
-static void bdg_netmap_attach(struct ifnet *ifp);
+static void bdg_netmap_attach(struct netmap_adapter *);
 static int bdg_netmap_reg(struct ifnet *ifp, int onoff);
+static int kern_netmap_regif(struct nmreq *nmr);
+
 /* per-tx-queue entry */
 struct nm_bdg_fwd {    /* forwarding entry for a bridge */
        void *buf;
-       uint64_t dst;   /* dst mask */
-       uint32_t src;   /* src index ? */
-       uint16_t len;   /* src len */
+       uint32_t ft_dst;        /* dst port */
+       uint16_t ft_len;        /* src len */
+       uint16_t ft_next;       /* next packet to same destination */
+};
+
+/* We need to build a list of buffers going to each destination.
+ * Each buffer is in one entry of struct nm_bdg_fwd, we use ft_next
+ * to build the list, and struct nm_bdg_q below for the queue.
+ * The structure should compact because potentially we have a lot
+ * of destinations.
+ */
+struct nm_bdg_q {
+       uint16_t bq_head;
+       uint16_t bq_tail;
 };
 
 struct nm_hash_ent {
@@ -198,26 +219,78 @@ struct nm_hash_ent {
  * The bridge is non blocking on the transmit ports.
  *
  * bdg_lock protects accesses to the bdg_ports array.
+ * This is a rw lock (or equivalent).
  */
 struct nm_bridge {
-       struct ifnet *bdg_ports[NM_BDG_MAXPORTS];
-       int n_ports;
-       uint64_t act_ports;
-       int freelist;   /* first buffer index */
-       NM_SELINFO_T si;        /* poll/select wait queue */
-       NM_LOCK_T bdg_lock;     /* protect the selinfo ? */
+       int namelen;    /* 0 means free */
 
-       /* the forwarding table, MAC+ports */
-       struct nm_hash_ent ht[NM_BDG_HASH];
+       /* XXX what is the proper alignment/layout ? */
+       NM_RWLOCK_T bdg_lock;   /* protects bdg_ports */
+       struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS];
 
-       int namelen;    /* 0 means free */
        char basename[IFNAMSIZ];
+       /*
+        * The function to decide the destination port.
+        * It returns either of an index of the destination port,
+        * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
+        * forward this packet.  ring_nr is the source ring index, and the
+        * function may overwrite this value to forward this packet to a
+        * different ring index.
+        * This function must be set by netmap_bdgctl().
+        */
+       bdg_lookup_fn_t nm_bdg_lookup;
+
+       /* the forwarding table, MAC+ports */
+       struct nm_hash_ent ht[NM_BDG_HASH];
 };
 
 struct nm_bridge nm_bridges[NM_BRIDGES];
+NM_LOCK_T      netmap_bridge_mutex;
 
-#define BDG_LOCK(b)    mtx_lock(&(b)->bdg_lock)
-#define BDG_UNLOCK(b)  mtx_unlock(&(b)->bdg_lock)
+/* other OS will have these macros defined in their own glue code. */
+
+#ifdef __FreeBSD__
+#define BDG_LOCK()             mtx_lock(&netmap_bridge_mutex)
+#define BDG_UNLOCK()           mtx_unlock(&netmap_bridge_mutex)
+#define BDG_WLOCK(b)           rw_wlock(&(b)->bdg_lock)
+#define BDG_WUNLOCK(b)         rw_wunlock(&(b)->bdg_lock)
+#define BDG_RLOCK(b)           rw_rlock(&(b)->bdg_lock)
+#define BDG_RUNLOCK(b)         rw_runlock(&(b)->bdg_lock)
+
+/* set/get variables. OS-specific macros may wrap these
+ * assignments into read/write lock or similar
+ */
+#define BDG_SET_VAR(lval, p)   (lval = p)
+#define BDG_GET_VAR(lval)      (lval)
+#define BDG_FREE(p)            free(p, M_DEVBUF)
+#endif /* __FreeBSD__ */
+
+static __inline int
+nma_is_vp(struct netmap_adapter *na)
+{
+       return na->nm_register == bdg_netmap_reg;
+}
+static __inline int
+nma_is_host(struct netmap_adapter *na)
+{
+       return na->nm_register == NULL;
+}
+static __inline int
+nma_is_hw(struct netmap_adapter *na)
+{
+       /* In case of sw adapter, nm_register is NULL */
+       return !nma_is_vp(na) && !nma_is_host(na);
+}
+
+/*
+ * Regarding holding a NIC, if the NIC is owned by the kernel
+ * (i.e., bridge), neither another bridge nor user can use it;
+ * if the NIC is owned by a user, only users can share it.
+ * Evaluation must be done under NMA_LOCK().
+ */
+#define NETMAP_OWNED_BY_KERN(ifp)      (!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg)
+#define NETMAP_OWNED_BY_ANY(ifp) \
+       (NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0))
 
 /*
  * NA(ifp)->bdg_port   port index
@@ -245,15 +318,16 @@ pkt_copy(void *_src, void *_dst, int l)
         }
 }
 
+
 /*
  * locate a bridge among the existing ones.
  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
  * We assume that this is called with a name of at least NM_NAME chars.
  */
 static struct nm_bridge *
-nm_find_bridge(const char *name)
+nm_find_bridge(const char *name, int create)
 {
-       int i, l, namelen, e;
+       int i, l, namelen;
        struct nm_bridge *b = NULL;
 
        namelen = strlen(NM_NAME);      /* base length */
@@ -268,29 +342,94 @@ nm_find_bridge(const char *name)
                namelen = IFNAMSIZ;
        ND("--- prefix is '%.*s' ---", namelen, name);
 
-       /* use the first entry for locking */
-       BDG_LOCK(nm_bridges); // XXX do better
-       for (e = -1, i = 1; i < NM_BRIDGES; i++) {
-               b = nm_bridges + i;
-               if (b->namelen == 0)
-                       e = i;  /* record empty slot */
-               else if (strncmp(name, b->basename, namelen) == 0) {
+       BDG_LOCK();
+       /* lookup the name, remember empty slot if there is one */
+       for (i = 0; i < NM_BRIDGES; i++) {
+               struct nm_bridge *x = nm_bridges + i;
+
+               if (x->namelen == 0) {
+                       if (create && b == NULL)
+                               b = x;  /* record empty slot */
+               } else if (x->namelen != namelen) {
+                       continue;
+               } else if (strncmp(name, x->basename, namelen) == 0) {
                        ND("found '%.*s' at %d", namelen, name, i);
+                       b = x;
                        break;
                }
        }
-       if (i == NM_BRIDGES) { /* all full */
-               if (e == -1) { /* no empty slot */
-                       b = NULL;
-               } else {
-                       b = nm_bridges + e;
-                       strncpy(b->basename, name, namelen);
-                       b->namelen = namelen;
-               }
+       if (i == NM_BRIDGES && b) { /* name not found, can create entry */
+               strncpy(b->basename, name, namelen);
+               b->namelen = namelen;
+               /* set the default function */
+               b->nm_bdg_lookup = netmap_bdg_learning;
+               /* reset the MAC address table */
+               bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
        }
-       BDG_UNLOCK(nm_bridges);
+       BDG_UNLOCK();
        return b;
 }
+
+
+/*
+ * Free the forwarding tables for rings attached to switch ports.
+ */
+static void
+nm_free_bdgfwd(struct netmap_adapter *na)
+{
+       int nrings, i;
+       struct netmap_kring *kring;
+
+       nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
+       kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
+       for (i = 0; i < nrings; i++) {
+               if (kring[i].nkr_ft) {
+                       free(kring[i].nkr_ft, M_DEVBUF);
+                       kring[i].nkr_ft = NULL; /* protect from freeing twice */
+               }
+       }
+       if (nma_is_hw(na))
+               nm_free_bdgfwd(SWNA(na->ifp));
+}
+
+
+/*
+ * Allocate the forwarding tables for the rings attached to the bridge ports.
+ */
+static int
+nm_alloc_bdgfwd(struct netmap_adapter *na)
+{
+       int nrings, l, i, num_dstq;
+       struct netmap_kring *kring;
+
+       /* all port:rings + broadcast */
+       num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
+       l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH;
+       l += sizeof(struct nm_bdg_q) * num_dstq;
+       l += sizeof(uint16_t) * NM_BDG_BATCH;
+
+       nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
+       kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
+       for (i = 0; i < nrings; i++) {
+               struct nm_bdg_fwd *ft;
+               struct nm_bdg_q *dstq;
+               int j;
+
+               ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
+               if (!ft) {
+                       nm_free_bdgfwd(na);
+                       return ENOMEM;
+               }
+               dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH);
+               for (j = 0; j < num_dstq; j++)
+                       dstq[j].bq_head = dstq[j].bq_tail = NM_BDG_BATCH;
+               kring[i].nkr_ft = ft;
+       }
+       if (nma_is_hw(na))
+               nm_alloc_bdgfwd(SWNA(na->ifp));
+       return 0;
+}
+
 #endif /* NM_BRIDGE */
 
 
@@ -413,20 +552,11 @@ netmap_dtor_locked(void *data)
                if (netmap_verbose)
                        D("deleting last instance for %s", ifp->if_xname);
                /*
-                * there is a race here with *_netmap_task() and
-                * netmap_poll(), which don't run under NETMAP_REG_LOCK.
-                * na->refcount == 0 && na->ifp->if_capenable & IFCAP_NETMAP
-                * (aka NETMAP_DELETING(na)) are a unique marker that the
-                * device is dying.
-                * Before destroying stuff we sleep a bit, and then complete
-                * the job. NIOCREG should realize the condition and
-                * loop until they can continue; the other routines
-                * should check the condition at entry and quit if
-                * they cannot run.
+                * (TO CHECK) This function is only called
+                * when the last reference to this file descriptor goes
+                * away. This means we cannot have any pending poll()
+                * or interrupt routine operating on the structure.
                 */
-               na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
-               tsleep(na, 0, "NIOCUNREG", 4);
-               na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
                na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */
                /* Wake up any sleeping threads. netmap_poll will
                 * then return POLLERR
@@ -437,6 +567,9 @@ netmap_dtor_locked(void *data)
                        selwakeuppri(&na->rx_rings[i].si, PI_NET);
                selwakeuppri(&na->tx_si, PI_NET);
                selwakeuppri(&na->rx_si, PI_NET);
+#ifdef NM_BRIDGE
+               nm_free_bdgfwd(na);
+#endif /* NM_BRIDGE */
                /* release all buffers */
                for (i = 0; i < na->num_tx_rings + 1; i++) {
                        struct netmap_ring *ring = na->tx_rings[i].ring;
@@ -458,49 +591,81 @@ netmap_dtor_locked(void *data)
                /* knlist_destroy(&na->tx_si.si_note); */
                /* knlist_destroy(&na->rx_si.si_note); */
                netmap_free_rings(na);
-               wakeup(na);
+               if (nma_is_hw(na))
+                       SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL;
        }
        netmap_if_free(nifp);
 }
 
+
+/* we assume netmap adapter exists */
 static void
 nm_if_rele(struct ifnet *ifp)
 {
 #ifndef NM_BRIDGE
        if_rele(ifp);
 #else /* NM_BRIDGE */
-       int i, full;
+       int i, full = 0, is_hw;
        struct nm_bridge *b;
+       struct netmap_adapter *na;
 
-       if (strncmp(ifp->if_xname, NM_NAME, sizeof(NM_NAME) - 1)) {
+       /* I can be called not only for get_ifp()-ed references where netmap's
+        * capability is guaranteed, but also for non-netmap-capable NICs.
+        */
+       if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) {
                if_rele(ifp);
                return;
        }
        if (!DROP_BDG_REF(ifp))
                return;
-       b = ifp->if_bridge;
-       BDG_LOCK(nm_bridges);
-       BDG_LOCK(b);
+
+       na = NA(ifp);
+       b = na->na_bdg;
+       is_hw = nma_is_hw(na);
+
+       BDG_WLOCK(b);
        ND("want to disconnect %s from the bridge", ifp->if_xname);
        full = 0;
+       /* remove the entry from the bridge, also check
+        * if there are any leftover interfaces
+        * XXX we should optimize this code, e.g. going directly
+        * to na->bdg_port, and having a counter of ports that
+        * are connected. But it is not in a critical path.
+        * In NIC's case, index of sw na is always higher than hw na
+        */
        for (i = 0; i < NM_BDG_MAXPORTS; i++) {
-               if (b->bdg_ports[i] == ifp) {
-                       b->bdg_ports[i] = NULL;
-                       bzero(ifp, sizeof(*ifp));
-                       free(ifp, M_DEVBUF);
-                       break;
-               }
-               else if (b->bdg_ports[i] != NULL)
+               struct netmap_adapter *tmp = BDG_GET_VAR(b->bdg_ports[i]);
+
+               if (tmp == na) {
+                       /* disconnect from bridge */
+                       BDG_SET_VAR(b->bdg_ports[i], NULL);
+                       na->na_bdg = NULL;
+                       if (is_hw && SWNA(ifp)->na_bdg) {
+                               /* disconnect sw adapter too */
+                               int j = SWNA(ifp)->bdg_port;
+                               BDG_SET_VAR(b->bdg_ports[j], NULL);
+                               SWNA(ifp)->na_bdg = NULL;
+                       }
+               } else if (tmp != NULL) {
                        full = 1;
+               }
        }
-       BDG_UNLOCK(b);
+       BDG_WUNLOCK(b);
        if (full == 0) {
-               ND("freeing bridge %d", b - nm_bridges);
+               ND("marking bridge %d as free", b - nm_bridges);
                b->namelen = 0;
+               b->nm_bdg_lookup = NULL;
        }
-       BDG_UNLOCK(nm_bridges);
-       if (i == NM_BDG_MAXPORTS)
+       if (na->na_bdg) { /* still attached to the bridge */
                D("ouch, cannot find ifp to remove");
+       } else if (is_hw) {
+               if_rele(ifp);
+       } else {
+               bzero(na, sizeof(*na));
+               free(na, M_DEVBUF);
+               bzero(ifp, sizeof(*ifp));
+               free(ifp, M_DEVBUF);
+       }
 #endif /* NM_BRIDGE */
 }
 
@@ -514,9 +679,13 @@ netmap_dtor(void *data)
        if (ifp) {
                struct netmap_adapter *na = NA(ifp);
 
+               if (na->na_bdg)
+                       BDG_WLOCK(na->na_bdg);
                na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
                netmap_dtor_locked(data);
                na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
+               if (na->na_bdg)
+                       BDG_WUNLOCK(na->na_bdg);
 
                nm_if_rele(ifp); /* might also destroy *na */
        }
@@ -528,6 +697,7 @@ netmap_dtor(void *data)
        free(priv, M_DEVBUF);
 }
 
+
 #ifdef __FreeBSD__
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -536,8 +706,16 @@ netmap_dtor(void *data)
 #include <vm/vm_pager.h>
 #include <vm/uma.h>
 
+/*
+ * In order to track whether pages are still mapped, we hook into
+ * the standard cdev_pager and intercept the constructor and
+ * destructor.
+ * XXX but then ? Do we really use the information ?
+ * Need to investigate.
+ */
 static struct cdev_pager_ops saved_cdev_pager_ops;
 
+
 static int
 netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t foff, struct ucred *cred, u_short *color)
@@ -548,6 +726,7 @@ netmap_dev_pager_ctor(void *handle, vm_o
                        size, prot, foff, cred, color);
 }
 
+
 static void
 netmap_dev_pager_dtor(void *handle)
 {
@@ -562,6 +741,8 @@ static struct cdev_pager_ops netmap_cdev
         .cdev_pg_fault = NULL,
 };
 
+
+// XXX check whether we need netmap_mmap_single _and_ netmap_mmap
 static int
 netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff,
        vm_size_t objsize,  vm_object_t *objp, int prot)
@@ -630,6 +811,7 @@ netmap_mmap(__unused struct cdev *dev,
        return (*paddr ? 0 : ENOMEM);
 }
 
+
 static int
 netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
 {
@@ -639,6 +821,7 @@ netmap_close(struct cdev *dev, int fflag
        return 0;
 }
 
+
 static int
 netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
 {
@@ -677,6 +860,7 @@ netmap_open(struct cdev *dev, int oflags
  * might take a while before releasing the buffer.
  */
 
+
 /*
  * pass a chain of buffers to the host stack as coming from 'dst'
  */
@@ -701,6 +885,7 @@ struct mbq {
        int count;
 };
 
+
 /*
  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
  * Run from hwcur to cur - reserved
@@ -745,6 +930,7 @@ netmap_grab_packets(struct netmap_kring 
        q->tail = tail;
 }
 
+
 /*
  * called under main lock to send packets from the host to the NIC
  * The host ring has packets from nr_hwcur to (cur - reserved)
@@ -794,6 +980,7 @@ netmap_sw_to_nic(struct netmap_adapter *
        }
 }
 
+
 /*
  * netmap_sync_to_host() passes packets up. We are called from a
  * system call in user process context, and the only contention
@@ -827,6 +1014,18 @@ netmap_sync_to_host(struct netmap_adapte
        netmap_send_up(na->ifp, q.head);
 }
 
+
+/* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */
+static int
+netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int do_lock)
+{
+       (void)ring_nr;
+       (void)do_lock;
+       netmap_sync_to_host(NA(ifp));
+       return 0;
+}
+
+
 /*
  * rxsync backend for packets coming from the host stack.
  * They have been put in the queue by netmap_start() so we
@@ -881,38 +1080,60 @@ netmap_sync_from_host(struct netmap_adap
  * Return ENXIO if the interface does not exist, EINVAL if netmap
  * is not supported by the interface.
  * If successful, hold a reference.
+ *
+ * During the NIC is attached to a bridge, reference is managed
+ * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as
+ * virtual ports.  Hence, on the final DROP_BDG_REF(), the NIC
+ * is detached from the bridge, then ifp's refcount is dropped (this
+ * is equivalent to that ifp is destroyed in case of virtual ports.
+ *
+ * This function uses if_rele() when we want to prevent the NIC from
+ * being detached from the bridge in error handling.  But once refcount
+ * is acquired by this function, it must be released using nm_if_rele().
  */
 static int
-get_ifp(const char *name, struct ifnet **ifp)
+get_ifp(struct nmreq *nmr, struct ifnet **ifp)
 {
+       const char *name = nmr->nr_name;
+       int namelen = strlen(name);
 #ifdef NM_BRIDGE
        struct ifnet *iter = NULL;
+       int no_prefix = 0;
 
        do {
                struct nm_bridge *b;
-               int i, l, cand = -1;
+               struct netmap_adapter *na;
+               int i, cand = -1, cand2 = -1;
 
-               if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1))
+               if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
+                       no_prefix = 1;
                        break;
-               b = nm_find_bridge(name);
+               }
+               b = nm_find_bridge(name, 1 /* create a new one if no exist */ );
                if (b == NULL) {
                        D("no bridges available for '%s'", name);
                        return (ENXIO);
                }
-               /* XXX locking */
-               BDG_LOCK(b);
+               /* Now we are sure that name starts with the bridge's name */
+               BDG_WLOCK(b);
                /* lookup in the local list of ports */
                for (i = 0; i < NM_BDG_MAXPORTS; i++) {
-                       iter = b->bdg_ports[i];
-                       if (iter == NULL) {
+                       na = BDG_GET_VAR(b->bdg_ports[i]);
+                       if (na == NULL) {
                                if (cand == -1)
                                        cand = i; /* potential insert point */
+                               else if (cand2 == -1)
+                                       cand2 = i; /* for host stack */
                                continue;
                        }
-                       if (!strcmp(iter->if_xname, name)) {
+                       iter = na->ifp;
+                       /* XXX make sure the name only contains one : */
+                       if (!strcmp(iter->if_xname, name) /* virtual port */ ||
+                           (namelen > b->namelen && !strcmp(iter->if_xname,
+                           name + b->namelen + 1)) /* NIC */) {
                                ADD_BDG_REF(iter);
                                ND("found existing interface");
-                               BDG_UNLOCK(b);
+                               BDG_WUNLOCK(b);
                                break;
                        }
                }
@@ -921,23 +1142,73 @@ get_ifp(const char *name, struct ifnet *
                if (cand == -1) {
                        D("bridge full, cannot create new port");
 no_port:
-                       BDG_UNLOCK(b);
+                       BDG_WUNLOCK(b);
                        *ifp = NULL;
                        return EINVAL;
                }
                ND("create new bridge port %s", name);
-               /* space for forwarding list after the ifnet */
-               l = sizeof(*iter) +
-                        sizeof(struct nm_bdg_fwd)*NM_BDG_BATCH ;
-               iter = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
-               if (!iter)
-                       goto no_port;
-               strcpy(iter->if_xname, name);
-               bdg_netmap_attach(iter);
-               b->bdg_ports[cand] = iter;
-               iter->if_bridge = b;
+               /*
+                * create a struct ifnet for the new port.
+                * The forwarding table is attached to the kring(s).
+                */
+               /*
+                * try see if there is a matching NIC with this name
+                * (after the bridge's name)
+                */
+               iter = ifunit_ref(name + b->namelen + 1);
+               if (!iter) { /* this is a virtual port */
+                       /* Create a temporary NA with arguments, then
+                        * bdg_netmap_attach() will allocate the real one
+                        * and attach it to the ifp
+                        */
+                       struct netmap_adapter tmp_na;
+
+                       if (nmr->nr_cmd) /* nr_cmd must be for a NIC */
+                               goto no_port;
+                       bzero(&tmp_na, sizeof(tmp_na));
+                       /* bound checking */
+                       if (nmr->nr_tx_rings < 1)
+                               nmr->nr_tx_rings = 1;
+                       if (nmr->nr_tx_rings > NM_BDG_MAXRINGS)
+                               nmr->nr_tx_rings = NM_BDG_MAXRINGS;
+                       tmp_na.num_tx_rings = nmr->nr_tx_rings;
+                       if (nmr->nr_rx_rings < 1)
+                               nmr->nr_rx_rings = 1;
+                       if (nmr->nr_rx_rings > NM_BDG_MAXRINGS)
+                               nmr->nr_rx_rings = NM_BDG_MAXRINGS;
+                       tmp_na.num_rx_rings = nmr->nr_rx_rings;
+
+                       iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | 
M_ZERO);
+                       if (!iter)
+                               goto no_port;
+                       strcpy(iter->if_xname, name);
+                       tmp_na.ifp = iter;
+                       /* bdg_netmap_attach creates a struct netmap_adapter */
+                       bdg_netmap_attach(&tmp_na);
+               } else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */
+                       /* cannot attach the NIC that any user or another
+                        * bridge already holds.
+                        */
+                       if (NETMAP_OWNED_BY_ANY(iter) || cand2 == -1) {
+ifunit_rele:
+                               if_rele(iter); /* don't detach from bridge */
+                               goto no_port;
+                       }
+                       /* bind the host stack to the bridge */
+                       if (nmr->nr_arg1 == NETMAP_BDG_HOST) {
+                               BDG_SET_VAR(b->bdg_ports[cand2], SWNA(iter));
+                               SWNA(iter)->bdg_port = cand2;
+                               SWNA(iter)->na_bdg = b;
+                       }
+               } else /* not a netmap-capable NIC */
+                       goto ifunit_rele;
+               na = NA(iter);
+               na->bdg_port = cand;
+               /* bind the port to the bridge (virtual ports are not active) */
+               BDG_SET_VAR(b->bdg_ports[cand], na);
+               na->na_bdg = b;
                ADD_BDG_REF(iter);
-               BDG_UNLOCK(b);
+               BDG_WUNLOCK(b);
                ND("attaching virtual bridge %p", b);
        } while (0);
        *ifp = iter;
@@ -949,8 +1220,16 @@ no_port:
        /* can do this if the capability exists and if_pspare[0]
         * points to the netmap descriptor.
         */
-       if (NETMAP_CAPABLE(*ifp))
+       if (NETMAP_CAPABLE(*ifp)) {
+#ifdef NM_BRIDGE
+               /* Users cannot use the NIC attached to a bridge directly */
+               if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) {
+                       if_rele(*ifp); /* don't detach from bridge */
+                       return EINVAL;
+               } else
+#endif /* NM_BRIDGE */
                return 0;       /* valid pointer, we hold the refcount */
+       }
        nm_if_rele(*ifp);
        return EINVAL;  // not NETMAP capable
 }
@@ -1059,6 +1338,296 @@ netmap_set_ringid(struct netmap_priv_d *
        return 0;
 }
 
+
+/*
+ * Possibly move the interface to netmap mode (i.e. under the control
+ * of the netmap subsystem, with its rings exported to userspace).
+ * On success return a pointer to the netmap_if; otherwise return NULL
+ * and report the reason through *err.
+ * This must be called with NMA_LOCK held.
+ */
+static struct netmap_if *
+netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp,
+       uint16_t ringid, int *err)
+{
+       struct netmap_adapter *na = NA(ifp);
+       struct netmap_if *nifp = NULL;
+       int i, error;
+
+       /* if the port is on a bridge, keep the topology stable meanwhile */
+       if (na->na_bdg)
+               BDG_WLOCK(na->na_bdg);
+       na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
+
+       /* ring configuration may have changed, fetch from the card */
+       netmap_update_config(na);
+       priv->np_ifp = ifp;     /* store the reference */
+       error = netmap_set_ringid(priv, ringid);
+       if (error)
+               goto out;
+       nifp = netmap_if_new(ifp->if_xname, na);
+       if (nifp == NULL) { /* allocation failed */
+               error = ENOMEM;
+       } else if (ifp->if_capenable & IFCAP_NETMAP) {
+               /* was already set */
+       } else {
+               /* Otherwise set the card in netmap mode
+                * and make it use the shared buffers.
+                * NOTE(review): the "+ 1" presumably covers the extra
+                * host-stack ring pair -- confirm against netmap_attach().
+                */
+               for (i = 0 ; i < na->num_tx_rings + 1; i++)
+                       mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock",
+                           MTX_NETWORK_LOCK, MTX_DEF);
+               for (i = 0 ; i < na->num_rx_rings + 1; i++) {
+                       mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock",
+                           MTX_NETWORK_LOCK, MTX_DEF);
+               }
+               /* point the sw adapter at the ring pair one past the
+                * hw rings (initialized by the loops above)
+                */
+               if (nma_is_hw(na)) {
+                       SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings];
+                       SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings];
+               }
+               error = na->nm_register(ifp, 1); /* mode on */
+#ifdef NM_BRIDGE
+               if (!error)
+                       error = nm_alloc_bdgfwd(na);
+#endif /* NM_BRIDGE */
+               if (error) {
+                       netmap_dtor_locked(priv);
+                       /* nifp is not yet in priv, so free it separately */
+                       netmap_if_free(nifp);
+                       nifp = NULL;
+               }
+
+       }
+out:
+       *err = error;
+       na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
+       if (na->na_bdg)
+               BDG_WUNLOCK(na->na_bdg);
+       return nifp;
+}
+
+
+/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH on behalf of the
+ * kernel. Allocates a private descriptor (npriv) and either registers
+ * the interface in netmap mode (ATTACH) or unregisters and releases
+ * it (DETACH). On successful ATTACH, ownership of npriv moves to
+ * NA(ifp)->na_kpriv; on every failure path it is zeroed and freed.
+ * Note: the free_exit/unref_exit labels sit inside earlier error
+ * branches and are jumped back into to share the cleanup sequence.
+ * Returns 0 on success or an errno value.
+ */
+static int
+kern_netmap_regif(struct nmreq *nmr)
+{
+       struct ifnet *ifp;
+       struct netmap_if *nifp;
+       struct netmap_priv_d *npriv;
+       int error;
+
+       npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
+       if (npriv == NULL)
+               return ENOMEM;
+       error = netmap_get_memory(npriv);
+       if (error) {
+free_exit:
+               bzero(npriv, sizeof(*npriv));
+               free(npriv, M_DEVBUF);
+               return error;
+       }
+
+       NMA_LOCK();
+       error = get_ifp(nmr, &ifp);
+       if (error) { /* no device, or another bridge or user owns the device */
+               NMA_UNLOCK();
+               goto free_exit;
+       } else if (!NETMAP_OWNED_BY_KERN(ifp)) {
+               /* got reference to a virtual port or direct access to a NIC.
+                * perhaps specified no bridge's prefix or wrong NIC's name
+                */
+               error = EINVAL;
+unref_exit:
+               nm_if_rele(ifp);
+               NMA_UNLOCK();
+               goto free_exit;
+       }
+
+       if (nmr->nr_cmd == NETMAP_BDG_DETACH) {
+               if (NA(ifp)->refcount == 0) { /* not registered */
+                       error = EINVAL;
+                       goto unref_exit;
+               }
+               NMA_UNLOCK();
+
+               netmap_dtor(NA(ifp)->na_kpriv); /* unregister */
+               NA(ifp)->na_kpriv = NULL;
+               nm_if_rele(ifp); /* detach from the bridge */
+               /* error is still 0 here, so this returns success */
+               goto free_exit;
+       } else if (NA(ifp)->refcount > 0) { /* already registered */
+               error = EINVAL;
+               goto unref_exit;
+       }
+
+       nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error);
+       if (!nifp)
+               goto unref_exit;
+       wmb(); // XXX do we need it ?
+       /* publish nifp and hand npriv over to the adapter */
+       npriv->np_nifp = nifp;
+       NA(ifp)->na_kpriv = npriv;
+       NMA_UNLOCK();
+       D("registered %s to netmap-mode", ifp->if_xname);
+       return 0;
+}
+
+
+/* Lock dispatcher for the software (host stack) adapter's rings.
+ * CORE_LOCK is not necessary: each ring has its own q_lock mutex and
+ * this wrapper only maps the 'what' request onto the per-queue lock.
+ * Unrecognized 'what' values are silently ignored.
+ */
+static void
+netmap_swlock_wrapper(struct ifnet *dev, int what, u_int queueid)
+{
+       struct netmap_adapter *na = SWNA(dev);
+
+       switch (what) {
+       case NETMAP_TX_LOCK:
+               mtx_lock(&na->tx_rings[queueid].q_lock);
+               break;
+
+       case NETMAP_TX_UNLOCK:
+               mtx_unlock(&na->tx_rings[queueid].q_lock);
+               break;
+
+       case NETMAP_RX_LOCK:
+               mtx_lock(&na->rx_rings[queueid].q_lock);
+               break;
+
+       case NETMAP_RX_UNLOCK:
+               mtx_unlock(&na->rx_rings[queueid].q_lock);
+               break;
+       }
+}
+
+
+/* Initialize the necessary fields of the sw adapter located right
+ * after the hw one. The sw adapter attaches a pair of sw rings of the
+ * netmap-mode NIC. It is always activated and deactivated at the same
+ * time as the hw adapter, so no refcounting is needed on it.
+ * Regardless of the NIC's features we use a separate lock
+ * (netmap_swlock_wrapper) so that anybody can lock the sw adapter
+ * independently of the hw one.
+ * nm_register is deliberately left NULL so nma_is_hw() is FALSE here.
+ */
+static void
+netmap_attach_sw(struct ifnet *ifp)
+{
+       struct netmap_adapter *hw_na = NA(ifp);
+       struct netmap_adapter *na = SWNA(ifp);
+
+       na->ifp = ifp;
+       na->separate_locks = 1;
+       na->nm_lock = netmap_swlock_wrapper;
+       na->num_rx_rings = na->num_tx_rings = 1; /* one ring pair only */
+       na->num_tx_desc = hw_na->num_tx_desc;
+       na->num_rx_desc = hw_na->num_rx_desc;
+       na->nm_txsync = netmap_bdg_to_host; /* tx means "pass to host stack" */
+}
+
+
+/* exported to kernel callers */
+int
+netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
+{
+       struct nm_bridge *b;
+       struct netmap_adapter *na;
+       struct ifnet *iter;
+       char *name = nmr->nr_name;
+       int cmd = nmr->nr_cmd, namelen = strlen(name);
+       int error = 0, i, j;
+
+       switch (cmd) {
+       case NETMAP_BDG_ATTACH:
+       case NETMAP_BDG_DETACH:
+               error = kern_netmap_regif(nmr);
+               break;
+
+       case NETMAP_BDG_LIST:
+               /* this is used to enumerate bridges and ports */
+               if (namelen) { /* look up indexes of bridge and port */
+                       if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
+                               error = EINVAL;
+                               break;
+                       }
+                       b = nm_find_bridge(name, 0 /* don't create */);
+                       if (!b) {
+                               error = ENOENT;
+                               break;
+                       }
+
+                       BDG_RLOCK(b);
+                       error = ENOENT;
+                       for (i = 0; i < NM_BDG_MAXPORTS; i++) {
+                               na = BDG_GET_VAR(b->bdg_ports[i]);
+                               if (na == NULL)
+                                       continue;
+                               iter = na->ifp;
+                               /* the former and the latter identify a
+                                * virtual port and a NIC, respectively
+                                */
+                               if (!strcmp(iter->if_xname, name) ||
+                                   (namelen > b->namelen &&
+                                   !strcmp(iter->if_xname,
+                                   name + b->namelen + 1))) {
+                                       /* bridge index */
+                                       nmr->nr_arg1 = b - nm_bridges;
+                                       nmr->nr_arg2 = i; /* port index */
+                                       error = 0;
+                                       break;
+                               }
+                       }
+                       BDG_RUNLOCK(b);
+               } else {
+                       /* return the first non-empty entry starting from
+                        * bridge nr_arg1 and port nr_arg2.
+                        *
+                        * Users can detect the end of the same bridge by
+                        * seeing the new and old value of nr_arg1, and can
+                        * detect the end of all the bridge by error != 0
+                        */
+                       i = nmr->nr_arg1;
+                       j = nmr->nr_arg2;
+
+                       for (error = ENOENT; error && i < NM_BRIDGES; i++) {
+                               b = nm_bridges + i;
+                               BDG_RLOCK(b);
+                               for (; j < NM_BDG_MAXPORTS; j++) {
+                                       na = BDG_GET_VAR(b->bdg_ports[j]);
+                                       if (na == NULL)
+                                               continue;
+                                       iter = na->ifp;
+                                       nmr->nr_arg1 = i;
+                                       nmr->nr_arg2 = j;
+                                       strncpy(name, iter->if_xname, IFNAMSIZ);
+                                       error = 0;
+                                       break;
+                               }
+                               BDG_RUNLOCK(b);
+                               j = 0; /* following bridges scan from 0 */
+                       }

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscribe@freebsd.org"

Reply via email to