The branch main has been updated by glebius:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=5f539170784cf8aed6989e055ff3ac190d0de80b

commit 5f539170784cf8aed6989e055ff3ac190d0de80b
Author:     Gleb Smirnoff <gleb...@freebsd.org>
AuthorDate: 2025-03-07 06:58:35 +0000
Commit:     Gleb Smirnoff <gleb...@freebsd.org>
CommitDate: 2025-03-07 06:58:35 +0000

    inpcb: retire two-level port hash database
    
    This structure originates from the pre-FreeBSD times when system RAM was
    measured in single digits of MB and Internet speeds were measured in Kb.
    At the first level the database hashes only the port value to calculate
    an index into an array of pointers to lazily allocated headers that hold
    lists of inpcbs with the same local port.  This design was apparently
    made to conserve kernel memory.
    
    In the modern kernel the size of the first level of the hash is derived
    from maxsockets, which is derived from maxfiles, which in turn is derived
    from the amount of physical memory.  Then the size of the hash is capped
    by IPPORT_MAX, because it doesn't make any sense to have a hash table
    larger than the set of possible values.  In practice this cap works even
    on my laptop.  I haven't done precise calculations or experiments, but my
    guess is that any system with > 8 Gb of RAM will be autotuned to an
    IPPORT_MAX sized hash.  Apparently, this hash is a degenerate one: it
    never has more than one entry in any slot.  You can check this with kgdb:
    
        set $i = 0
        while ($i <= tcbinfo->ipi_porthashmask)
            set $p = tcbinfo->ipi_porthashbase[$i].clh_first
            set $c = 0
            while ($p != 0)
                set $c = $c + 1
                set $p = $p->phd_hash.cle_next
            end
            if ($c > 1)
                printf "Slot %u count %u", $i, $c
            end
            set $i = $i + 1
        end
    
    Retiring the two-level hash, we remove a lot of complexity at the cost of
    only one comparison 'inp->inp_lport != lport' in the lookup loop, which
    is going to be always false on most machines anyway.  This comparison
    should definitely be cheaper than an extra pointer traversal.
    
    Another positive change to be singled out is that now we no longer need to
    allocate memory in non-sleepable context in in_pcbinshash(), so a
    potential ENOMEM on connect(2) is removed.
    
    Reviewed by:            markj
    Differential Revision:  https://reviews.freebsd.org/D49151
---
 sys/netinet/in_pcb.c     | 143 +++++++++++++++--------------------------------
 sys/netinet/in_pcb.h     |   7 +--
 sys/netinet/in_pcb_var.h |   6 --
 sys/netinet6/in6_pcb.c   |  75 +++++++++++--------------
 4 files changed, 79 insertions(+), 152 deletions(-)

diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index fa2c60b93cfa..08097ea8c1b9 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -576,7 +576,6 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
        pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
            &pcbinfo->ipi_lbgrouphashmask);
        pcbinfo->ipi_zone = pcbstor->ips_zone;
-       pcbinfo->ipi_portzone = pcbstor->ips_portzone;
        pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
 }
 
@@ -612,10 +611,6 @@ in_pcbstorage_init(void *arg)
        pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
            pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit,
            inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
-       pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
-           sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
-       uma_zone_set_smr(pcbstor->ips_portzone,
-           uma_zone_get_smr(pcbstor->ips_zone));
 }
 
 /*
@@ -627,7 +622,6 @@ in_pcbstorage_destroy(void *arg)
        struct inpcbstorage *pcbstor = arg;
 
        uma_zdestroy(pcbstor->ips_zone);
-       uma_zdestroy(pcbstor->ips_portzone);
 }
 
 /*
@@ -2028,71 +2022,58 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
                 */
                return (NULL);
        } else {
-               struct inpcbporthead *porthash;
-               struct inpcbport *phd;
+               struct inpcbhead *porthash;
                struct inpcb *match = NULL;
+
                /*
-                * Best fit PCB lookup.
-                *
-                * First see if this local port is in use by looking on the
-                * port hash list.
+                * Port is in use by one or more PCBs. Look for best
+                * fit.
                 */
                porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
                    pcbinfo->ipi_porthashmask)];
-               CK_LIST_FOREACH(phd, porthash, phd_hash) {
-                       if (phd->phd_port == lport)
-                               break;
-               }
-               if (phd != NULL) {
+               CK_LIST_FOREACH(inp, porthash, inp_portlist) {
+                       if (inp->inp_lport != lport)
+                               continue;
+                       if (!prison_equal_ip4(inp->inp_cred->cr_prison,
+                           cred->cr_prison))
+                               continue;
+                       if (fib != RT_ALL_FIBS &&
+                           inp->inp_inc.inc_fibnum != fib)
+                               continue;
+                       wildcard = 0;
+#ifdef INET6
+                       /* XXX inp locking */
+                       if ((inp->inp_vflag & INP_IPV4) == 0)
+                               continue;
                        /*
-                        * Port is in use by one or more PCBs. Look for best
-                        * fit.
+                        * We never select the PCB that has INP_IPV6 flag and
+                        * is bound to :: if we have another PCB which is bound
+                        * to 0.0.0.0.  If a PCB has the INP_IPV6 flag, then we
+                        * set its cost higher than IPv4 only PCBs.
+                        *
+                        * Note that the case only happens when a socket is
+                        * bound to ::, under the condition that the use of the
+                        * mapped address is allowed.
                         */
-                       CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
-                               wildcard = 0;
-                               if (!prison_equal_ip4(inp->inp_cred->cr_prison,
-                                   cred->cr_prison))
-                                       continue;
-                               if (fib != RT_ALL_FIBS &&
-                                   inp->inp_inc.inc_fibnum != fib)
-                                       continue;
-#ifdef INET6
-                               /* XXX inp locking */
-                               if ((inp->inp_vflag & INP_IPV4) == 0)
-                                       continue;
-                               /*
-                                * We never select the PCB that has
-                                * INP_IPV6 flag and is bound to :: if
-                                * we have another PCB which is bound
-                                * to 0.0.0.0.  If a PCB has the
-                                * INP_IPV6 flag, then we set its cost
-                                * higher than IPv4 only PCBs.
-                                *
-                                * Note that the case only happens
-                                * when a socket is bound to ::, under
-                                * the condition that the use of the
-                                * mapped address is allowed.
-                                */
-                               if ((inp->inp_vflag & INP_IPV6) != 0)
-                                       wildcard += INP_LOOKUP_MAPPED_PCB_COST;
+                       if ((inp->inp_vflag & INP_IPV6) != 0)
+                               wildcard += INP_LOOKUP_MAPPED_PCB_COST;
 #endif
-                               if (inp->inp_faddr.s_addr != INADDR_ANY)
+                       if (inp->inp_faddr.s_addr != INADDR_ANY)
+                               wildcard++;
+                       if (inp->inp_laddr.s_addr != INADDR_ANY) {
+                               if (laddr.s_addr == INADDR_ANY)
+                                       wildcard++;
+                               else if (inp->inp_laddr.s_addr != laddr.s_addr)
+                                       continue;
+                       } else {
+                               if (laddr.s_addr != INADDR_ANY)
                                        wildcard++;
-                               if (inp->inp_laddr.s_addr != INADDR_ANY) {
-                                       if (laddr.s_addr == INADDR_ANY)
-                                               wildcard++;
-                                       else if (inp->inp_laddr.s_addr != laddr.s_addr)
-                                               continue;
-                               } else {
-                                       if (laddr.s_addr != INADDR_ANY)
-                                               wildcard++;
-                               }
-                               if (wildcard < matchwild) {
-                                       match = inp;
-                                       matchwild = wildcard;
-                                       if (matchwild == 0)
-                                               break;
-                               }
+                       }
+                       if (wildcard < matchwild) {
+                               match = inp;
+                               matchwild = wildcard;
+                               if (matchwild == 0)
+                                       break;
                        }
                }
                return (match);
@@ -2642,10 +2623,8 @@ _in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
 int
 in_pcbinshash(struct inpcb *inp)
 {
-       struct inpcbhead *pcbhash;
-       struct inpcbporthead *pcbporthash;
+       struct inpcbhead *pcbhash, *pcbporthash;
        struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
-       struct inpcbport *phd;
        uint32_t hash;
        bool connected;
 
@@ -2685,31 +2664,6 @@ in_pcbinshash(struct inpcb *inp)
                        return (error);
        }
 
-       /*
-        * Go through port list and look for a head for this lport.
-        */
-       CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
-               if (phd->phd_port == inp->inp_lport)
-                       break;
-       }
-
-       /*
-        * If none exists, malloc one and tack it on.
-        */
-       if (phd == NULL) {
-               phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
-               if (phd == NULL) {
-                       if ((inp->inp_flags & INP_INLBGROUP) != 0)
-                               in_pcbremlbgrouphash(inp);
-                       return (ENOMEM);
-               }
-               phd->phd_port = inp->inp_lport;
-               CK_LIST_INIT(&phd->phd_pcblist);
-               CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
-       }
-       inp->inp_phd = phd;
-       CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
-
        /*
         * The PCB may have been disconnected in the past.  Before we can safely
         * make it visible in the hash table, we must wait for all readers which
@@ -2730,6 +2684,7 @@ in_pcbinshash(struct inpcb *inp)
 #endif
                        _in_pcbinshash_wild(pcbhash, inp);
        }
+       CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist);
        inp->inp_flags |= INP_INHASHLIST;
 
        return (0);
@@ -2738,7 +2693,6 @@ in_pcbinshash(struct inpcb *inp)
 void
 in_pcbremhash_locked(struct inpcb *inp)
 {
-       struct inpcbport *phd = inp->inp_phd;
 
        INP_WLOCK_ASSERT(inp);
        INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
@@ -2761,10 +2715,6 @@ in_pcbremhash_locked(struct inpcb *inp)
                        CK_LIST_REMOVE(inp, inp_hash_exact);
        }
        CK_LIST_REMOVE(inp, inp_portlist);
-       if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
-               CK_LIST_REMOVE(phd, phd_hash);
-               uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
-       }
        inp->inp_flags &= ~INP_INHASHLIST;
 }
 
@@ -3275,8 +3225,7 @@ db_print_inpcb(struct inpcb *inp, const char *name, int indent)
        }
 
        db_print_indent(indent);
-       db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
-           (uintmax_t)inp->inp_gencnt);
+       db_printf("inp_gencnt: %ju\n", (uintmax_t)inp->inp_gencnt);
 }
 
 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index c2b90de2ef54..5fe12c4f1e76 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -64,7 +64,6 @@
  * protocol-specific control block) are stored here.
  */
 CK_LIST_HEAD(inpcbhead, inpcb);
-CK_LIST_HEAD(inpcbporthead, inpcbport);
 CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup);
 typedef        uint64_t        inp_gen_t;
 
@@ -221,7 +220,6 @@ struct inpcb {
                short   in6p_hops;
        };
        CK_LIST_ENTRY(inpcb) inp_portlist;      /* (r:e/w:h) port list */
-       struct  inpcbport *inp_phd;     /* (r:e/w:h) head of this list */
        inp_gen_t       inp_gencnt;     /* (c) generation count */
        void            *spare_ptr;     /* Spare pointer. */
        rt_gen_t        inp_rt_cookie;  /* generation for route entry */
@@ -370,7 +368,7 @@ struct inpcbinfo {
        /*
         * Global hash of inpcbs, hashed by only local port number.
         */
-       struct inpcbporthead    *ipi_porthashbase;      /* (h) */
+       struct inpcbhead        *ipi_porthashbase;      /* (h) */
        u_long                   ipi_porthashmask;      /* (h) */
 
        /*
@@ -392,11 +390,9 @@ struct inpcbinfo {
  */
 struct inpcbstorage {
        uma_zone_t      ips_zone;
-       uma_zone_t      ips_portzone;
        uma_init        ips_pcbinit;
        size_t          ips_size;
        const char *    ips_zone_name;
-       const char *    ips_portzone_name;
        const char *    ips_infolock_name;
        const char *    ips_hashlock_name;
 };
@@ -414,7 +410,6 @@ static struct inpcbstorage prot = {                         \
        .ips_size = sizeof(struct ppcb),                                \
        .ips_pcbinit = prot##_inpcb_init,                               \
        .ips_zone_name = zname,                                         \
-       .ips_portzone_name = zname " ports",                            \
        .ips_infolock_name = iname,                                     \
        .ips_hashlock_name = hname,                                     \
 };                                                                     \
diff --git a/sys/netinet/in_pcb_var.h b/sys/netinet/in_pcb_var.h
index fb88dfec889e..7e8a1626ab40 100644
--- a/sys/netinet/in_pcb_var.h
+++ b/sys/netinet/in_pcb_var.h
@@ -59,12 +59,6 @@ int     in_pcbinshash(struct inpcb *);
 void    in_pcbrehash(struct inpcb *);
 void    in_pcbremhash_locked(struct inpcb *);
 
-struct inpcbport {
-       struct inpcbhead phd_pcblist;
-       CK_LIST_ENTRY(inpcbport) phd_hash;
-       u_short phd_port;
-};
-
 /*
  * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
  * (or unique address:port combination) can be re-used at most
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 64c886ca2ed5..e77a1e9d3e87 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -767,56 +767,45 @@ in6_pcblookup_local(struct inpcbinfo *pcbinfo, const struct in6_addr *laddr,
                 */
                return (NULL);
        } else {
-               struct inpcbporthead *porthash;
-               struct inpcbport *phd;
+               struct inpcbhead *porthash;
                struct inpcb *match = NULL;
+
                /*
-                * Best fit PCB lookup.
-                *
-                * First see if this local port is in use by looking on the
-                * port hash list.
+                * Port is in use by one or more PCBs. Look for best
+                * fit.
                 */
                porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
                    pcbinfo->ipi_porthashmask)];
-               CK_LIST_FOREACH(phd, porthash, phd_hash) {
-                       if (phd->phd_port == lport)
-                               break;
-               }
-               if (phd != NULL) {
-                       /*
-                        * Port is in use by one or more PCBs. Look for best
-                        * fit.
-                        */
-                       CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
-                               wildcard = 0;
-                               if (!prison_equal_ip6(cred->cr_prison,
-                                   inp->inp_cred->cr_prison))
-                                       continue;
-                               /* XXX inp locking */
-                               if ((inp->inp_vflag & INP_IPV6) == 0)
-                                       continue;
-                               if (fib != RT_ALL_FIBS &&
-                                   inp->inp_inc.inc_fibnum != fib)
+               CK_LIST_FOREACH(inp, porthash, inp_portlist) {
+                       if (inp->inp_lport != lport)
+                               continue;
+                       if (!prison_equal_ip6(cred->cr_prison,
+                           inp->inp_cred->cr_prison))
+                               continue;
+                       /* XXX inp locking */
+                       if ((inp->inp_vflag & INP_IPV6) == 0)
+                               continue;
+                       if (fib != RT_ALL_FIBS &&
+                           inp->inp_inc.inc_fibnum != fib)
+                               continue;
+                       wildcard = 0;
+                       if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
+                               wildcard++;
+                       if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
+                               if (IN6_IS_ADDR_UNSPECIFIED(laddr))
+                                       wildcard++;
+                               else if (!IN6_ARE_ADDR_EQUAL(
+                                   &inp->in6p_laddr, laddr))
                                        continue;
-                               if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
+                       } else {
+                               if (!IN6_IS_ADDR_UNSPECIFIED(laddr))
                                        wildcard++;
-                               if (!IN6_IS_ADDR_UNSPECIFIED(
-                                       &inp->in6p_laddr)) {
-                                       if (IN6_IS_ADDR_UNSPECIFIED(laddr))
-                                               wildcard++;
-                                       else if (!IN6_ARE_ADDR_EQUAL(
-                                           &inp->in6p_laddr, laddr))
-                                               continue;
-                               } else {
-                                       if (!IN6_IS_ADDR_UNSPECIFIED(laddr))
-                                               wildcard++;
-                               }
-                               if (wildcard < matchwild) {
-                                       match = inp;
-                                       matchwild = wildcard;
-                                       if (matchwild == 0)
-                                               break;
-                               }
+                       }
+                       if (wildcard < matchwild) {
+                               match = inp;
+                               matchwild = wildcard;
+                               if (matchwild == 0)
+                                       break;
                        }
                }
                return (match);

Reply via email to