Author: sbruno
Date: Mon Apr 23 19:51:00 2018
New Revision: 332894
URL: https://svnweb.freebsd.org/changeset/base/332894

Log:
  Load balance sockets with new SO_REUSEPORT_LB option
  
  This patch adds a new socket option, SO_REUSEPORT_LB, which allows multiple
  programs or threads to bind to the same port; incoming connections are then
  load balanced across them using a hash function.
  
  Most of the code was copied from a similar patch for DragonflyBSD.
  
  However, in DragonflyBSD, load balancing is a global on/off setting and
  cannot be set per socket. This patch allows for simultaneous use of both the
  current SO_REUSEPORT and the new SO_REUSEPORT_LB options on the same system.
  
  Required changes to structures
  Globally change so_options from 16 to 32 bit value to allow for more options.
  Add hashtable in pcbinfo to hold all SO_REUSEPORT_LB sockets.
  
  Limitations
  As in DragonflyBSD, a load balance group is limited to 256 pcbs
  (256 programs or threads sharing the same socket).
  
  Submitted by: Johannes Lundberg <johanl...@gmail.com>
  Sponsored by: Limelight Networks
  Differential Revision:        https://reviews.freebsd.org/D11003

Modified:
  head/cddl/lib/libdtrace/tcp.d
  head/sys/kern/uipc_debug.c
  head/sys/kern/uipc_socket.c
  head/sys/netinet/in_pcb.c
  head/sys/netinet/in_pcb.h
  head/sys/netinet/ip_output.c
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/udp_usrreq.c
  head/sys/netinet6/in6_pcb.c
  head/sys/netinet6/in6_src.c
  head/sys/netinet6/ip6_output.c
  head/sys/netinet6/udp6_usrreq.c
  head/sys/sys/socket.h
  head/sys/sys/socketvar.h

Modified: head/cddl/lib/libdtrace/tcp.d
==============================================================================
--- head/cddl/lib/libdtrace/tcp.d       Mon Apr 23 18:33:26 2018        
(r332893)
+++ head/cddl/lib/libdtrace/tcp.d       Mon Apr 23 19:51:00 2018        
(r332894)
@@ -192,12 +192,12 @@ translator tcpsinfo_t < struct tcpcb *p > {
        tcps_rport =            p == NULL ? 0 : 
ntohs(p->t_inpcb->inp_inc.inc_ie.ie_fport);
        tcps_laddr =            p == NULL ? 0 :
            p->t_inpcb->inp_vflag == INP_IPV4 ?
-           
inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.ie46_local.ia46_addr4.s_addr)
 :
-           inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.ie6_local);
+           
inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.id46_addr.ia46_addr4.s_addr)
 :
+           inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.id6_addr);
        tcps_raddr =            p == NULL ? 0 :
            p->t_inpcb->inp_vflag == INP_IPV4 ?
-           
inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.ie46_foreign.ia46_addr4.s_addr)
 :
-           inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.ie6_foreign);
+           
inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.id46_addr.ia46_addr4.s_addr)
 :
+           inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.id6_addr);
        tcps_state =            p == NULL ? -1 : p->t_state;
        tcps_iss =              p == NULL ? 0  : p->iss;
        tcps_irs =              p == NULL ? 0  : p->irs;

Modified: head/sys/kern/uipc_debug.c
==============================================================================
--- head/sys/kern/uipc_debug.c  Mon Apr 23 18:33:26 2018        (r332893)
+++ head/sys/kern/uipc_debug.c  Mon Apr 23 19:51:00 2018        (r332894)
@@ -77,7 +77,7 @@ db_print_sotype(short so_type)
 }
 
 static void
-db_print_sooptions(short so_options)
+db_print_sooptions(int so_options)
 {
        int comma;
 
@@ -120,6 +120,10 @@ db_print_sooptions(short so_options)
        }
        if (so_options & SO_REUSEPORT) {
                db_printf("%sSO_REUSEPORT", comma ? ", " : "");
+               comma = 1;
+       }
+       if (so_options & SO_REUSEPORT_LB) {
+               db_printf("%sSO_REUSEPORT_LB", comma ? ", " : "");
                comma = 1;
        }
        if (so_options & SO_TIMESTAMP) {

Modified: head/sys/kern/uipc_socket.c
==============================================================================
--- head/sys/kern/uipc_socket.c Mon Apr 23 18:33:26 2018        (r332893)
+++ head/sys/kern/uipc_socket.c Mon Apr 23 19:51:00 2018        (r332894)
@@ -1057,6 +1057,100 @@ sofree(struct socket *so)
 }
 
 /*
+ * Let socket in same load balance group (same port and address)
+ * inherit pending sockets of the closing socket.
+ *
+ * "so_inh" will inherit sockets from "so"
+ */
+void
+soinherit(struct socket *so, struct socket *so_inh)
+{
+       TAILQ_HEAD(, socket) comp, incomp;
+       struct socket *sp, *head, *head_inh;
+       int qlen, incqlen;
+
+       /* Both the donor and the inheritor must be accepting connections. */
+       KASSERT(so->so_options & SO_ACCEPTCONN,
+           ("so does not accept connection"));
+       KASSERT(so_inh->so_options & SO_ACCEPTCONN,
+           ("so_inh does not accept connection"));
+
+
+       /*
+        * Lock "so" and, when so->so_listen is set, try-lock that socket
+        * as well.  If the try-lock fails, drop the socket lock and start
+        * over, so this spins until both are held.
+        */
+restart:
+       SOCK_LOCK(so);
+       if ((head = so->so_listen) != NULL &&
+           __predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
+               SOCK_UNLOCK(so);
+               goto restart;
+       }
+
+       /*
+        * Same dance for "so_inh".  The locks taken above for "so" stay
+        * held while this one retries.
+        */
+restart_inh:
+       SOCK_LOCK(so_inh);
+       if ((head_inh = so_inh->so_listen) != NULL &&
+           __predict_false(SOLISTEN_TRYLOCK(head_inh) == 0)) {
+               SOCK_UNLOCK(so_inh);
+               goto restart_inh;
+       }
+
+       TAILQ_INIT(&comp);
+       TAILQ_INIT(&incomp);
+
+       /*
+        * Move the completed and incomplete queues of "so" onto local
+        * lists and reset its queue lengths.
+        */
+       TAILQ_CONCAT(&comp, &so->sol_comp, so_list);
+       qlen = so->sol_qlen;
+       so->sol_qlen = 0;
+
+       TAILQ_CONCAT(&incomp, &so->sol_incomp, so_list);
+       incqlen = so->sol_incqlen;
+       so->sol_incqlen = 0;
+
+       /*
+        * Re-home every saved socket on the inheriting socket: take a
+        * reference on so_inh, point so_listen at it and switch the
+        * credential to so_inh's, then append the lists to so_inh.
+        *
+        * XXX
+        * This may temporarily break the inheriting socket's
+        * so_qlimit.
+        *
+        * NOTE(review): nothing here drops a matching reference on "so"
+        * for the sockets that leave its queues -- confirm whether the
+        * caller accounts for that.
+        */
+       TAILQ_FOREACH(sp, &comp, so_list) {
+               refcount_acquire(&so_inh->so_count);
+               sp->so_listen = so_inh;
+               crfree(sp->so_cred);
+               sp->so_cred = crhold(so_inh->so_cred);
+       }
+
+       TAILQ_FOREACH(sp, &incomp, so_list) {
+               refcount_acquire(&so_inh->so_count);
+               sp->so_listen = so_inh;
+               crfree(sp->so_cred);
+               sp->so_cred = crhold(so_inh->so_cred);
+       }
+
+       TAILQ_CONCAT(&so_inh->sol_comp, &comp, so_list);
+       so_inh->sol_qlen += qlen;
+
+       TAILQ_CONCAT(&so_inh->sol_incomp, &incomp, so_list);
+       so_inh->sol_incqlen += incqlen;
+
+       SOCK_UNLOCK(so);
+       if(head != NULL)
+               SOLISTEN_UNLOCK(head);
+
+       SOCK_UNLOCK(so_inh);
+       if(head_inh != NULL) {
+               if(qlen > 0) {
+                       /*
+                        * "New" connections have arrived
+                        *
+                        * NOTE(review): presumably solisten_wakeup()
+                        * releases the listen lock taken above, since
+                        * only the else branch unlocks explicitly --
+                        * confirm.  The wakeup goes to so_inh->so_listen
+                        * rather than so_inh itself and is skipped when
+                        * that pointer is NULL -- confirm this is
+                        * intended.
+                        */
+                       solisten_wakeup(head_inh);
+               } else {
+                       SOLISTEN_UNLOCK(head_inh);
+               }
+       }
+}
+
+/*
  * Close a socket on last file table reference removal.  Initiate disconnect
  * if connected.  Free socket when disconnect complete.
  *
@@ -2776,6 +2870,7 @@ sosetopt(struct socket *so, struct sockopt *sopt)
                case SO_BROADCAST:
                case SO_REUSEADDR:
                case SO_REUSEPORT:
+               case SO_REUSEPORT_LB:
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
                case SO_BINTIME:
@@ -2994,6 +3089,7 @@ sogetopt(struct socket *so, struct sockopt *sopt)
                case SO_KEEPALIVE:
                case SO_REUSEADDR:
                case SO_REUSEPORT:
+               case SO_REUSEPORT_LB:
                case SO_BROADCAST:
                case SO_OOBINLINE:
                case SO_ACCEPTCONN:

Modified: head/sys/netinet/in_pcb.c
==============================================================================
--- head/sys/netinet/in_pcb.c   Mon Apr 23 18:33:26 2018        (r332893)
+++ head/sys/netinet/in_pcb.c   Mon Apr 23 19:51:00 2018        (r332894)
@@ -108,6 +108,9 @@ __FBSDID("$FreeBSD$");
 
 #include <security/mac/mac_framework.h>
 
+#define INPCBLBGROUP_SIZMIN    8
+#define INPCBLBGROUP_SIZMAX    256
+
 static struct callout  ipport_tick_callout;
 
 /*
@@ -217,7 +220,186 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtim
  * functions often modify hash chains or addresses in pcbs.
  */
 
+/*
+ * Allocate a zeroed load balance group with room for "size" inpcb slots,
+ * fill in its key (vflag, local port, local address) and its capacity, and
+ * link it at the head of "hdr".  The allocation is made with M_NOWAIT;
+ * NULL is returned when it fails.
+ */
+static struct inpcblbgroup *
+in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
+    uint16_t port, const union in_dependaddr *addr, int size)
+{
+       struct inpcblbgroup *new_grp;
+
+       /* The header plus "size" trailing il_inp[] pointers. */
+       new_grp = malloc(__offsetof(struct inpcblbgroup, il_inp[size]),
+           M_PCB, M_ZERO | M_NOWAIT);
+       if (new_grp == NULL)
+               return (NULL);
+
+       new_grp->il_inpsiz = size;
+       new_grp->il_dependladdr = *addr;
+       new_grp->il_lport = port;
+       new_grp->il_vflag = vflag;
+       LIST_INSERT_HEAD(hdr, new_grp, il_list);
+
+       return (new_grp);
+}
+
+/*
+ * Unlink a load balance group from its hash chain and release its memory.
+ * Groups are allocated from M_PCB in in_pcblbgroup_alloc(), so they have
+ * to be returned to the same malloc type.
+ */
+static void
+in_pcblbgroup_free(struct inpcblbgroup *grp)
+{
+       LIST_REMOVE(grp, il_list);
+       free(grp, M_PCB);
+}
+
+/*
+ * Replace "old_grp" with a newly allocated group of "size" slots that has
+ * the same key and the same member inpcbs.  On success the old group is
+ * freed and the new one is returned.  If the allocation fails, NULL is
+ * returned and "old_grp" is left untouched on its chain.
+ */
+static struct inpcblbgroup *
+in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
+    struct inpcblbgroup *old_grp, int size)
+{
+       struct inpcblbgroup *new_grp;
+       int slot;
+
+       new_grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
+           old_grp->il_lport, &old_grp->il_dependladdr, size);
+       if (new_grp == NULL)
+               return (NULL);
+
+       KASSERT(old_grp->il_inpcnt < new_grp->il_inpsiz,
+           ("invalid new local group size %d and old local group count %d",
+            new_grp->il_inpsiz, old_grp->il_inpcnt));
+
+       /* Carry the members over in their original order. */
+       new_grp->il_inpcnt = old_grp->il_inpcnt;
+       for (slot = 0; slot < old_grp->il_inpcnt; slot++)
+               new_grp->il_inp[slot] = old_grp->il_inp[slot];
+
+       in_pcblbgroup_free(old_grp);
+       return (new_grp);
+}
+
 /*
+ * Add PCB to lb group (load balance used by SO_REUSEPORT_LB)
+ */
+static int
+in_pcbinslbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo)
+{
+       struct inpcblbgrouphead *hdr;
+       struct inpcblbgroup *grp;
+       struct ucred *cred;
+
+       /*
+        * Returns 0 when the inpcb was added to a group and also when it
+        * was deliberately left out (no lb hash table, jailed socket, IPv4
+        * mapped INET6 wild socket, group already at INPCBLBGROUP_SIZMAX).
+        * Returns ENOBUFS when a group could not be allocated or grown.
+        *
+        * The hash table pointer is checked before anything indexes it.
+        */
+       if (pcbinfo->ipi_lbgrouphashbase == NULL)
+               return (0);
+
+       /*
+        * don't allow jailed socket to join local group
+        */
+       if (inp->inp_socket != NULL)
+               cred = inp->inp_socket->so_cred;
+       else
+               cred = NULL;
+       if (cred != NULL && jailed(cred))
+               return (0);
+
+#ifdef INET6
+       /*
+        * don't allow IPv4 mapped INET6 wild socket
+        */
+       if ((inp->inp_vflag & INP_IPV4) &&
+           inp->inp_laddr.s_addr == INADDR_ANY &&
+           INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
+               return (0);
+       }
+#endif
+
+       /* Bucket is selected by local port, using the full-width mask. */
+       hdr = &pcbinfo->ipi_lbgrouphashbase[INP_PCBLBGROUP_PORTHASH(
+           inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
+
+       /* Look for an existing group with the same vflag, port and address. */
+       LIST_FOREACH(grp, hdr, il_list) {
+               if (grp->il_vflag == inp->inp_vflag &&
+                   grp->il_lport == inp->inp_lport &&
+                   memcmp(&grp->il_dependladdr,
+                       &inp->inp_inc.inc_ie.ie_dependladdr,
+                       sizeof(grp->il_dependladdr)) == 0) {
+                       break;
+               }
+       }
+       if (grp == NULL) {
+               /* Create new load balance group */
+               grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
+                   inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
+                   INPCBLBGROUP_SIZMIN);
+               if (grp == NULL)
+                       return (ENOBUFS);
+       } else if (grp->il_inpcnt == grp->il_inpsiz) {
+               if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
+                       /* Report the overflow once, then stay quiet. */
+                       static int limit_logged = 0;
+
+                       if (!limit_logged) {
+                               limit_logged = 1;
+                               printf("lb group port %d, "
+                                   "limit reached\n",
+                                   ntohs(grp->il_lport));
+                       }
+                       return (0);
+               }
+
+               /* Expand this local group */
+               grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
+               if (grp == NULL)
+                       return (ENOBUFS);
+       }
+
+       KASSERT(grp->il_inpcnt < grp->il_inpsiz,
+                       ("invalid local group size %d and count %d",
+                        grp->il_inpsiz, grp->il_inpcnt));
+
+       /* Append the inpcb in the first free slot. */
+       grp->il_inp[grp->il_inpcnt] = inp;
+       grp->il_inpcnt++;
+       return (0);
+}
+
+/*
+ * Remove "inp" from the first load balance group on its port's hash chain
+ * that contains it.  A group holding only "inp" is freed.  Otherwise the
+ * following inpcbs are shifted down over the vacated slot, and the group
+ * is halved when it is larger than INPCBLBGROUP_SIZMIN and no more than a
+ * quarter full.  Does nothing when there is no lb hash table or when
+ * "inp" is not found.
+ */
+static void
+in_pcbremlbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo)
+{
+       struct inpcblbgrouphead *hdr;
+       struct inpcblbgroup *grp;
+       int pos;
+
+       if (pcbinfo->ipi_lbgrouphashbase == NULL)
+               return;
+
+       hdr = &pcbinfo->ipi_lbgrouphashbase[INP_PCBLBGROUP_PORTHASH(
+           inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
+
+       LIST_FOREACH(grp, hdr, il_list) {
+               /* Locate the slot holding inp, if this group has one. */
+               for (pos = 0; pos < grp->il_inpcnt; pos++) {
+                       if (grp->il_inp[pos] == inp)
+                               break;
+               }
+               if (pos == grp->il_inpcnt)
+                       continue;
+
+               if (grp->il_inpcnt == 1) {
+                       /* Free this local group */
+                       in_pcblbgroup_free(grp);
+                       return;
+               }
+
+               /* Pull up inpcbs */
+               while (pos + 1 < grp->il_inpcnt) {
+                       grp->il_inp[pos] = grp->il_inp[pos + 1];
+                       pos++;
+               }
+               grp->il_inpcnt--;
+
+               /*
+                * Shrink this local group.  A failed resize leaves the
+                * group in place at its current size, so the result is
+                * not needed.
+                */
+               if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
+                   grp->il_inpcnt <= (grp->il_inpsiz / 4))
+                       (void)in_pcblbgroup_resize(hdr, grp,
+                           grp->il_inpsiz / 2);
+               return;
+       }
+}
+
+/*
  * Different protocols initialize their inpcbs differently - giving
  * different name to the lock.  But they all are disposed the same.
  */
@@ -252,6 +434,8 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char 
            &pcbinfo->ipi_hashmask);
        pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
            &pcbinfo->ipi_porthashmask);
+       pcbinfo->ipi_lbgrouphashbase = hashinit(hash_nelements, M_PCB,
+           &pcbinfo->ipi_lbgrouphashmask);
 #ifdef PCBGROUP
        in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
 #endif
@@ -275,6 +459,8 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
        hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
        hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
            pcbinfo->ipi_porthashmask);
+       hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
+           pcbinfo->ipi_lbgrouphashmask);
 #ifdef PCBGROUP
        in_pcbgroup_destroy(pcbinfo);
 #endif
@@ -513,18 +699,20 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp
 /*
  * Return cached socket options.
  */
-short
+int
 inp_so_options(const struct inpcb *inp)
 {
-   short so_options;
+       int so_options;
 
-   so_options = 0;
+       so_options = 0;
 
-   if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
-          so_options |= SO_REUSEPORT;
-   if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
-          so_options |= SO_REUSEADDR;
-   return (so_options);
+       if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
+               so_options |= SO_REUSEPORT_LB;
+       if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
+               so_options |= SO_REUSEPORT;
+       if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
+               so_options |= SO_REUSEADDR;
+       return (so_options);
 }
 #endif /* INET || INET6 */
 
@@ -581,6 +769,12 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *n
        int error;
 
        /*
+        * XXX Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
+        * so that we don't have to add to the (already messy) code below
+        */
+       int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
+
+       /*
         * No state changes, so read locks are sufficient here.
         */
        INP_LOCK_ASSERT(inp);
@@ -591,7 +785,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *n
        laddr.s_addr = *laddrp;
        if (nam != NULL && laddr.s_addr != INADDR_ANY)
                return (EINVAL);
-       if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+       if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
                lookupflags = INPLOOKUP_WILDCARD;
        if (nam == NULL) {
                if ((error = prison_local_ip4(cred, &laddr)) != 0)
@@ -628,16 +822,20 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *n
                         */
                        if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
                                reuseport = SO_REUSEADDR|SO_REUSEPORT;
+                       // XXX: How to deal with SO_REUSEPORT_LB here?
+                       // Added equivalent treatment as SO_REUSEPORT here for 
now
+                       if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) 
!= 0)
+                               reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
                } else if (sin->sin_addr.s_addr != INADDR_ANY) {
                        sin->sin_port = 0;              /* yech... */
                        bzero(&sin->sin_zero, sizeof(sin->sin_zero));
                        /*
-                        * Is the address a local IP address? 
+                        * Is the address a local IP address?
                         * If INP_BINDANY is set, then the socket may be bound
                         * to any endpoint address, local or not.
                         */
                        if ((inp->inp_flags & INP_BINDANY) == 0 &&
-                           ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) 
+                           ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
                                return (EADDRNOTAVAIL);
                }
                laddr = sin->sin_addr;
@@ -667,7 +865,8 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *n
                                     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) 
&&
                                    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY 
||
                                     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
-                                    (t->inp_flags2 & INP_REUSEPORT) == 0) &&
+                                    (t->inp_flags2 & INP_REUSEPORT) ||
+                                    (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
                                    (inp->inp_cred->cr_uid !=
                                     t->inp_cred->cr_uid))
                                        return (EADDRINUSE);
@@ -692,11 +891,14 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *n
                                 */
                                tw = intotw(t);
                                if (tw == NULL ||
-                                   (reuseport & tw->tw_so_options) == 0)
+                                   ((reuseport & tw->tw_so_options) == 0 &&
+                                       (reuseport_lb & tw->tw_so_options) == 
0)) {
                                        return (EADDRINUSE);
+                               }
                        } else if (t &&
-                           ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
-                           (reuseport & inp_so_options(t)) == 0) {
+                                  ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
+                                  (reuseport & inp_so_options(t)) == 0 &&
+                                  (reuseport_lb & inp_so_options(t)) == 0) {
 #ifdef INET6
                                if (ntohl(sin->sin_addr.s_addr) !=
                                    INADDR_ANY ||
@@ -705,7 +907,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *n
                                    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
                                    (t->inp_vflag & INP_IPV6PROTO) == 0)
 #endif
-                               return (EADDRINUSE);
+                                               return (EADDRINUSE);
                                if (t && (! in_pcbbind_check_bindmulti(inp, t)))
                                        return (EADDRINUSE);
                        }
@@ -1409,6 +1611,7 @@ in_pcbdrop(struct inpcb *inp)
                struct inpcbport *phd = inp->inp_phd;
 
                INP_HASH_WLOCK(inp->inp_pcbinfo);
+               in_pcbremlbgrouphash(inp, inp->inp_pcbinfo);
                LIST_REMOVE(inp, inp_hash);
                LIST_REMOVE(inp, inp_portlist);
                if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
@@ -1669,6 +1872,98 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct i
 }
 #undef INP_LOOKUP_MAPPED_PCB_COST
 
+/*
+ * Return another member of the load balance group that "inp" belongs to:
+ * the group's last inpcb, or the second to last when "inp" itself is the
+ * last one.  Returns NULL when there is no lb hash table, no group
+ * matches inp's vflag, local port and local address, the matching group
+ * has a single member, or "inp" is not in the group.
+ */
+struct inpcb *
+in_pcblookup_lbgroup_last(const struct inpcb *inp)
+{
+       const struct inpcbinfo *pcbinfo;
+       const struct inpcblbgrouphead *hdr;
+       const struct inpcblbgroup *grp;
+       int pos, pick;
+
+       pcbinfo = inp->inp_pcbinfo;
+       if (pcbinfo->ipi_lbgrouphashbase == NULL)
+               return (NULL);
+
+       hdr = &pcbinfo->ipi_lbgrouphashbase[INP_PCBLBGROUP_PORTHASH(
+           inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
+
+       LIST_FOREACH(grp, hdr, il_list) {
+               if (grp->il_vflag != inp->inp_vflag ||
+                   grp->il_lport != inp->inp_lport)
+                       continue;
+               if (memcmp(&grp->il_dependladdr,
+                   &inp->inp_inc.inc_ie.ie_dependladdr,
+                   sizeof(grp->il_dependladdr)) == 0)
+                       break;
+       }
+       if (grp == NULL || grp->il_inpcnt == 1)
+               return (NULL);
+
+       KASSERT(grp->il_inpcnt >= 2,
+           ("invalid lbgroup inp count %d", grp->il_inpcnt));
+
+       for (pos = 0; pos < grp->il_inpcnt; pos++) {
+               if (grp->il_inp[pos] != inp)
+                       continue;
+
+               /* Prefer the tail; step back one when inp is the tail. */
+               pick = grp->il_inpcnt - 1;
+               if (pick == pos)
+                       pick--;
+               return (grp->il_inp[pick]);
+       }
+       return (NULL);
+}
+
+/*
+ * Select an inpcb from the IPv4 load balance groups bound to "lport".
+ * Within a group the member is chosen by hashing the foreign address and
+ * both ports, modulo the number of members.  A group whose local address
+ * equals "laddr" wins immediately; otherwise the pick from an INADDR_ANY
+ * group is returned when "lookupflags" has INPLOOKUP_WILDCARD.  Returns
+ * NULL when nothing matches.
+ */
+static struct inpcb *
+in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
+  const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
+  uint16_t fport, int lookupflags)
+{
+       struct inpcb *local_wild = NULL;
+       const struct inpcblbgrouphead *hdr;
+       struct inpcblbgroup *grp;
+       uint32_t idx;
+
+       hdr = &pcbinfo->ipi_lbgrouphashbase[
+                 INP_PCBLBGROUP_PORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
+
+       /*
+        * Order of socket selection:
+        * 1. non-wild.
+        * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
+        *
+        * NOTE:
+        * - Load balanced group does not contain jailed sockets
+        * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
+        */
+       LIST_FOREACH(grp, hdr, il_list) {
+#ifdef INET6
+               if (!(grp->il_vflag & INP_IPV4))
+                       continue;
+#endif
+               if (grp->il_lport != lport)
+                       continue;
+
+               /* Keep the hash unsigned all the way into the modulo. */
+               idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
+                   grp->il_inpcnt;
+
+               if (grp->il_laddr.s_addr == laddr->s_addr)
+                       return (grp->il_inp[idx]);
+
+               /* Remember the wildcard pick; keep scanning for exact. */
+               if (grp->il_laddr.s_addr == INADDR_ANY &&
+                   (lookupflags & INPLOOKUP_WILDCARD))
+                       local_wild = grp->il_inp[idx];
+       }
+       return (local_wild);
+}
+
 #ifdef PCBGROUP
 /*
  * Lookup PCB in hash list, using pcbgroup tables.
@@ -1948,6 +2243,18 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, st
                return (tmpinp);
 
        /*
+        * Then look in lb group (for wildcard match)
+        */
+       if (pcbinfo->ipi_lbgrouphashbase != NULL &&
+               (lookupflags & INPLOOKUP_WILDCARD)) {
+               inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, 
fport,
+                                                                  lookupflags);
+               if (inp != NULL) {
+                       return inp;
+               }
+       }
+
+       /*
         * Then look for a wildcard match, if requested.
         */
        if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
@@ -2164,6 +2471,7 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgr
        struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
        struct inpcbport *phd;
        u_int32_t hashkey_faddr;
+       int so_options;
 
        INP_WLOCK_ASSERT(inp);
        INP_HASH_WLOCK_ASSERT(pcbinfo);
@@ -2184,7 +2492,21 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgr
        pcbporthash = &pcbinfo->ipi_porthashbase[
            INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
 
+
        /*
+        * Add entry in lb group
+        * Only do this if SO_REUSEPORT_LB is set
+        */
+       so_options = inp_so_options(inp);
+       if(so_options & SO_REUSEPORT_LB) {
+               int ret = in_pcbinslbgrouphash(inp, pcbinfo);
+               if(ret) {
+                       // pcb lb group malloc fail (ret=ENOBUFS)
+                       return ret;
+               }
+       }
+
+       /*
         * Go through port list and look for a head for this lport.
         */
        LIST_FOREACH(phd, pcbporthash, phd_hash) {
@@ -2310,6 +2632,10 @@ in_pcbremlists(struct inpcb *inp)
                struct inpcbport *phd = inp->inp_phd;
 
                INP_HASH_WLOCK(pcbinfo);
+
+               // XXX Only do if SO_REUSEPORT_LB set?
+               in_pcbremlbgrouphash(inp, pcbinfo);
+
                LIST_REMOVE(inp, inp_hash);
                LIST_REMOVE(inp, inp_portlist);
                if (LIST_FIRST(&phd->phd_pcblist) == NULL) {

Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h   Mon Apr 23 18:33:26 2018        (r332893)
+++ head/sys/netinet/in_pcb.h   Mon Apr 23 19:51:00 2018        (r332894)
@@ -78,6 +78,11 @@ struct in_addr_4in6 {
        struct  in_addr ia46_addr4;
 };
 
+/*
+ * Protocol dependent address storage: either an IPv4 address held in the
+ * in_addr_4in6 layout, or an IPv6 address.
+ */
+union in_dependaddr {
+       /* IPv4 view; the address itself is the ia46_addr4 member */
+       struct in_addr_4in6 id46_addr;
+       /* IPv6 view */
+       struct in6_addr id6_addr;
+};
+
 /*
  * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553.  in_conninfo has
  * some extra padding to accomplish this.
@@ -88,22 +93,14 @@ struct in_endpoints {
        u_int16_t       ie_fport;               /* foreign port */
        u_int16_t       ie_lport;               /* local port */
        /* protocol dependent part, local and foreign addr */
-       union {
-               /* foreign host table entry */
-               struct  in_addr_4in6 ie46_foreign;
-               struct  in6_addr ie6_foreign;
-       } ie_dependfaddr;
-       union {
-               /* local host table entry */
-               struct  in_addr_4in6 ie46_local;
-               struct  in6_addr ie6_local;
-       } ie_dependladdr;
+       union in_dependaddr ie_dependfaddr;     /* foreign host table entry */
+       union in_dependaddr ie_dependladdr;     /* local host table entry */
+#define        ie_faddr        ie_dependfaddr.id46_addr.ia46_addr4
+#define        ie_laddr        ie_dependladdr.id46_addr.ia46_addr4
+#define        ie6_faddr       ie_dependfaddr.id6_addr
+#define        ie6_laddr       ie_dependladdr.id6_addr
        u_int32_t       ie6_zoneid;             /* scope zone id */
 };
-#define        ie_faddr        ie_dependfaddr.ie46_foreign.ia46_addr4
-#define        ie_laddr        ie_dependladdr.ie46_local.ia46_addr4
-#define        ie6_faddr       ie_dependfaddr.ie6_foreign
-#define        ie6_laddr       ie_dependladdr.ie6_local
 
 /*
  * XXX The defines for inc_* are hacks and should be changed to direct
@@ -407,6 +404,21 @@ struct inpcbport {
        u_short phd_port;
 };
 
+/*
+ * Load balance group: the inpcbs that share one (il_vflag, il_lport,
+ * il_dependladdr) key.  il_inp[] is a flexible array whose capacity is
+ * il_inpsiz and of which the first il_inpcnt entries are in use.
+ */
+struct inpcblbgroup {
+       /* linkage on the chain headed by a struct inpcblbgrouphead */
+       LIST_ENTRY(inpcblbgroup) il_list;
+       /* local port, compared against inp_lport */
+       uint16_t        il_lport;
+       /* compared against inp_vflag */
+       u_char          il_vflag;
+       /* presumably padding; not referenced by the code in this change */
+       u_char          il_pad;
+       uint32_t        il_pad2;
+       /* local address, compared against ie_dependladdr */
+       union in_dependaddr il_dependladdr;
+#define il_laddr       il_dependladdr.id46_addr.ia46_addr4
+#define il6_laddr      il_dependladdr.id6_addr
+       uint32_t        il_inpsiz; /* size of il_inp[] */
+       uint32_t        il_inpcnt; /* # of elem in il_inp[] */
+       struct inpcb    *il_inp[];
+};
+LIST_HEAD(inpcblbgrouphead, inpcblbgroup);
+
 /*-
  * Global data structure for each high-level protocol (UDP, TCP, ...) in both
  * IPv4 and IPv6.  Holds inpcb lists and information for managing them.
@@ -500,6 +512,13 @@ struct inpcbinfo {
        u_long                   ipi_wildmask;          /* (p) */
 
        /*
+        * Load balanced group used by the SO_REUSEPORT_LB option,
+        * hashed by local address and local port.
+        */
+       struct  inpcblbgrouphead *ipi_lbgrouphashbase;
+       u_long  ipi_lbgrouphashmask;
+
+       /*
         * Pointer to network stack instance
         */
        struct vnet             *ipi_vnet;              /* (c) */
@@ -585,7 +604,7 @@ struct tcpcb *
        inp_inpcbtotcpcb(struct inpcb *inp);
 void   inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
                uint32_t *faddr, uint16_t *fp);
-short  inp_so_options(const struct inpcb *inp);
+int            inp_so_options(const struct inpcb *inp);
 
 #endif /* _KERNEL */
 
@@ -648,6 +667,10 @@ short      inp_so_options(const struct inpcb *inp);
        (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
 #define INP_PCBPORTHASH(lport, mask) \
        (ntohs((lport)) & (mask))
+#define INP_PCBLBGROUP_PORTHASH(lport, mask) \
+       (ntohs((lport)) & (mask))
+#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
+       ((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport)))
 #define        INP6_PCBHASHKEY(faddr)  ((faddr)->s6_addr32[3])
 
 /*
@@ -716,6 +739,7 @@ short       inp_so_options(const struct inpcb *inp);
 #define        INP_RATE_LIMIT_CHANGED  0x00000400 /* rate limit needs 
attention */
 #define        INP_ORIGDSTADDR         0x00000800 /* receive IP dst 
address/port */
 #define INP_CANNOT_DO_ECN      0x00001000 /* The stack does not do ECN */
+#define        INP_REUSEPORT_LB        0x00002000 /* SO_REUSEPORT_LB option is 
set */
 
 /*
  * Flags passed to in_pcblookup*() functions.
@@ -818,6 +842,8 @@ struct inpcb *
 struct inpcb *
        in_pcblookup(struct inpcbinfo *, struct in_addr, u_int,
            struct in_addr, u_int, int, struct ifnet *);
+struct inpcb *
+       in_pcblookup_lbgroup_last(const struct inpcb *inp);
 struct inpcb *
        in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int,
            struct in_addr, u_int, int, struct ifnet *, struct mbuf *);

Modified: head/sys/netinet/ip_output.c
==============================================================================
--- head/sys/netinet/ip_output.c        Mon Apr 23 18:33:26 2018        
(r332893)
+++ head/sys/netinet/ip_output.c        Mon Apr 23 19:51:00 2018        
(r332894)
@@ -986,6 +986,15 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
                                INP_WUNLOCK(inp);
                                error = 0;
                                break;
+                       case SO_REUSEPORT_LB:
+                               INP_WLOCK(inp);
+                               if ((so->so_options & SO_REUSEPORT_LB) != 0)
+                                       inp->inp_flags2 |= INP_REUSEPORT_LB;
+                               else
+                                       inp->inp_flags2 &= ~INP_REUSEPORT_LB;
+                               INP_WUNLOCK(inp);
+                               error = 0;
+                               break;
                        case SO_SETFIB:
                                INP_WLOCK(inp);
                                inp->inp_inc.inc_fibnum = so->so_fibnum;

Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c Mon Apr 23 18:33:26 2018        (r332893)
+++ head/sys/netinet/tcp_subr.c Mon Apr 23 19:51:00 2018        (r332894)
@@ -1956,10 +1956,28 @@ tcp_close(struct tcpcb *tp)
 {
        struct inpcb *inp = tp->t_inpcb;
        struct socket *so;
+       struct inpcb *inp_inh = NULL;
+       int listen = tp->t_state & TCPS_LISTEN;
 
        INP_INFO_LOCK_ASSERT(&V_tcbinfo);
        INP_WLOCK_ASSERT(inp);
 
+       if (listen) {
+               /*
+                * Pending socket/syncache inheritance
+                *
+                * If this is a listen(2) socket, find another listen(2)
+                * socket in the same local group, which could inherit
+                * the syncache and sockets pending on the completion
+                * and incompletion queues.
+                *
+                * NOTE:
+                * Currently the inheritance could only happen on the
+                * listen(2) sockets with SO_REUSEPORT_LB set.
+                */
+               inp_inh = in_pcblookup_lbgroup_last(inp);
+       }
+
 #ifdef TCP_OFFLOAD
        if (tp->t_state == TCPS_LISTEN)
                tcp_offload_listen_stop(tp);
@@ -1979,7 +1997,16 @@ tcp_close(struct tcpcb *tp)
                tcp_state_change(tp, TCPS_CLOSED);
        KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
        so = inp->inp_socket;
+
        soisdisconnected(so);
+
+       if(listen)
+       {
+               if(inp_inh != NULL && inp_inh->inp_socket != NULL) {
+                       soinherit(so, inp_inh->inp_socket);
+               }
+       }
+
        if (inp->inp_flags & INP_SOCKREF) {
                KASSERT(so->so_state & SS_PROTOREF,
                    ("tcp_close: !SS_PROTOREF"));

Modified: head/sys/netinet/udp_usrreq.c
==============================================================================
--- head/sys/netinet/udp_usrreq.c       Mon Apr 23 18:33:26 2018        (r332893)
+++ head/sys/netinet/udp_usrreq.c       Mon Apr 23 19:51:00 2018        (r332894)
@@ -612,7 +612,7 @@ udp_input(struct mbuf **mp, int *offp, int proto)
                         * will never clear these options after setting them.
                         */
                        if ((last->inp_socket->so_options &
-                           (SO_REUSEPORT|SO_REUSEADDR)) == 0)
+                           (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0)
                                break;
                }
 

Modified: head/sys/netinet6/in6_pcb.c
==============================================================================
--- head/sys/netinet6/in6_pcb.c Mon Apr 23 18:33:26 2018        (r332893)
+++ head/sys/netinet6/in6_pcb.c Mon Apr 23 19:51:00 2018        (r332894)
@@ -125,6 +125,12 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
        int error, lookupflags = 0;
        int reuseport = (so->so_options & SO_REUSEPORT);
 
+       /*
+        * XXX Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
+        * so that we don't have to add to the (already messy) code below
+        */
+       int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
+
        INP_WLOCK_ASSERT(inp);
        INP_HASH_WLOCK_ASSERT(pcbinfo);
 
@@ -132,7 +138,7 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
                return (EADDRNOTAVAIL);
        if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
                return (EINVAL);
-       if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+       if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
                lookupflags = INPLOOKUP_WILDCARD;
        if (nam == NULL) {
                if ((error = prison_local_ip6(cred, &inp->in6p_laddr,
@@ -166,6 +172,10 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
                         */
                        if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
                                reuseport = SO_REUSEADDR|SO_REUSEPORT;
+                       // XXX: How to deal with SO_REUSEPORT_LB here?
+                       // Added equivalent treatment as SO_REUSEPORT here for 
now
+                       if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) 
!= 0)
+                               reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
                } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
                        struct ifaddr *ifa;
 
@@ -214,7 +224,8 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
                                     IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) &&
                                    (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) 
||
                                     !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) ||
-                                    (t->inp_flags2 & INP_REUSEPORT) == 0) &&
+                                    (t->inp_flags2 & INP_REUSEPORT) ||
+                                    (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
                                    (inp->inp_cred->cr_uid !=
                                     t->inp_cred->cr_uid))
                                        return (EADDRINUSE);
@@ -264,34 +275,39 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
                                 */
                                tw = intotw(t);
                                if (tw == NULL ||
-                                   (reuseport & tw->tw_so_options) == 0)
+                                   ((reuseport & tw->tw_so_options) == 0 &&
+                                        (reuseport_lb & tw->tw_so_options) == 
0))
                                        return (EADDRINUSE);
-                       } else if (t && (reuseport & inp_so_options(t)) == 0) {
+                       } else if (t && (reuseport & inp_so_options(t)) == 0 &&
+                                          (reuseport_lb & inp_so_options(t)) 
== 0) {
                                return (EADDRINUSE);
                        }
 #ifdef INET
                        if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 &&
-                           IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+                               IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
                                struct sockaddr_in sin;
 
                                in6_sin6_2_sin(&sin, sin6);
                                t = in_pcblookup_local(pcbinfo, sin.sin_addr,
-                                   lport, lookupflags, cred);
+                                                                          
lport, lookupflags, cred);
                                if (t && t->inp_flags & INP_TIMEWAIT) {
                                        tw = intotw(t);
                                        if (tw == NULL)
                                                return (EADDRINUSE);
                                        if ((reuseport & tw->tw_so_options) == 0
-                                           && (ntohl(t->inp_laddr.s_addr) !=
-                                            INADDR_ANY || ((inp->inp_vflag &
-                                            INP_IPV6PROTO) ==
-                                            (t->inp_vflag & INP_IPV6PROTO))))
+                                               && (reuseport_lb & 
tw->tw_so_options) == 0
+                                               && (ntohl(t->inp_laddr.s_addr) 
!=
+                                                       INADDR_ANY || 
((inp->inp_vflag &
+                                                                               
        INP_IPV6PROTO) ==
+                                                                               
   (t->inp_vflag & INP_IPV6PROTO))))
                                                return (EADDRINUSE);
                                } else if (t &&
-                                   (reuseport & inp_so_options(t)) == 0 &&
-                                   (ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
-                                   (t->inp_vflag & INP_IPV6PROTO) != 0))
+                                                  (reuseport & 
inp_so_options(t)) == 0 &&
+                                                  (reuseport_lb & 
inp_so_options(t)) == 0 &&
+                                                  (ntohl(t->inp_laddr.s_addr) 
!= INADDR_ANY ||
+                                                       (t->inp_vflag & 
INP_IPV6PROTO) != 0)) {
                                        return (EADDRINUSE);
+                               }
                        }
 #endif
                }
@@ -856,6 +872,54 @@ in6_rtchange(struct inpcb *inp, int errno)
        return inp;
 }
 
+/*
+ * Pick a PCB from the SO_REUSEPORT_LB groups hashed under lport, or NULL.
+ *
+ * For each group whose il_lport equals lport, the member index is the packet
+ * hash of (faddr, lport, fport) modulo il_inpcnt.  A group whose il6_laddr
+ * equals laddr is returned at once; a group on the unspecified address is
+ * kept as a fallback only when lookupflags has INPLOOKUP_WILDCARD set.
+ */
+static struct inpcb *
+in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
+  const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr,
+  uint16_t fport, int lookupflags)
+{
+       struct inpcb *local_wild = NULL;
+       const struct inpcblbgrouphead *hdr;
+       struct inpcblbgroup *grp;
+       uint32_t idx;
+
+       hdr = &pcbinfo->ipi_lbgrouphashbase[
+                 INP_PCBLBGROUP_PORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
+
+       /*
+        * Order of socket selection:
+        * 1. non-wild.
+        * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
+        *
+        * NOTE:
+        * - Load balanced group does not contain jailed sockets
+        * - Load balanced does not contain IPv4 mapped INET6 wild sockets
+        */
+       LIST_FOREACH(grp, hdr, il_list) {
+               if (grp->il_lport != lport)
+                       continue;
+
+               /*
+                * Hash in unsigned arithmetic so idx cannot go negative.
+                */
+               idx = INP_PCBLBGROUP_PKTHASH(INP6_PCBHASHKEY(faddr),
+                   lport, fport) % grp->il_inpcnt;
+               if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr))
+                       return (grp->il_inp[idx]);
+               if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) &&
+                   (lookupflags & INPLOOKUP_WILDCARD) != 0)
+                       local_wild = grp->il_inp[idx];
+       }
+       return (local_wild);
+}
+
 #ifdef PCBGROUP
 /*
  * Lookup PCB in hash list, using pcbgroup tables.
@@ -1057,6 +1121,8 @@ found:

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to