The branch main has been updated by glebius: URL: https://cgit.FreeBSD.org/src/commit/?id=5f539170784cf8aed6989e055ff3ac190d0de80b
commit 5f539170784cf8aed6989e055ff3ac190d0de80b Author: Gleb Smirnoff <gleb...@freebsd.org> AuthorDate: 2025-03-07 06:58:35 +0000 Commit: Gleb Smirnoff <gleb...@freebsd.org> CommitDate: 2025-03-07 06:58:35 +0000 inpcb: retire two-level port hash database This structure originates from the pre-FreeBSD times when system RAM was measured in single digits of MB and Internet speeds were measured in Kb. At the first level the database hashes only the port value to calculate an index into an array of pointers to lazily allocated headers that hold lists of inpcbs with the same local port. This design apparently was made to preserve kernel memory. In the modern kernel the size of the first level of the hash is derived from maxsockets, which is derived from maxfiles, which in turn is derived from the amount of physical memory. Then the size of the hash is capped by IPPORT_MAX, because it doesn't make any sense to have a hash table larger than the set of possible values. In practice this cap works even on my laptop. I haven't done precise calculations or experiments, but my guess is that any system with > 8 GB of RAM will be autotuned to an IPPORT_MAX-sized hash. Apparently, this hash is a degenerate one: it never has more than one entry in any slot. You can check this with kgdb: set $i = 0 while ($i <= tcbinfo->ipi_porthashmask) set $p = tcbinfo->ipi_porthashbase[$i].clh_first set $c = 0 while ($p != 0) set $c = $c + 1 set $p = $p->phd_hash.cle_next end if ($c > 1) printf "Slot %u count %u", $i, $c end set $i = $i + 1 end By retiring the two-level hash we remove a lot of complexity at the cost of only one comparison 'inp->inp_lport != lport' in the lookup cycle, which is always going to be false on most machines anyway. This comparison should definitely be cheaper than an extra pointer traversal. Another positive change to be singled out is that now we no longer need to allocate memory in a non-sleepable context in in_pcbinshash(), so a potential ENOMEM on connect(2) is removed. 
Reviewed by: markj Differential Revision: https://reviews.freebsd.org/D49151 --- sys/netinet/in_pcb.c | 143 +++++++++++++++-------------------------------- sys/netinet/in_pcb.h | 7 +-- sys/netinet/in_pcb_var.h | 6 -- sys/netinet6/in6_pcb.c | 75 +++++++++++-------------- 4 files changed, 79 insertions(+), 152 deletions(-) diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index fa2c60b93cfa..08097ea8c1b9 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -576,7 +576,6 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); pcbinfo->ipi_zone = pcbstor->ips_zone; - pcbinfo->ipi_portzone = pcbstor->ips_portzone; pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); } @@ -612,10 +611,6 @@ in_pcbstorage_init(void *arg) pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit, inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR); - pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name, - sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - uma_zone_set_smr(pcbstor->ips_portzone, - uma_zone_get_smr(pcbstor->ips_zone)); } /* @@ -627,7 +622,6 @@ in_pcbstorage_destroy(void *arg) struct inpcbstorage *pcbstor = arg; uma_zdestroy(pcbstor->ips_zone); - uma_zdestroy(pcbstor->ips_portzone); } /* @@ -2028,71 +2022,58 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, */ return (NULL); } else { - struct inpcbporthead *porthash; - struct inpcbport *phd; + struct inpcbhead *porthash; struct inpcb *match = NULL; + /* - * Best fit PCB lookup. - * - * First see if this local port is in use by looking on the - * port hash list. + * Port is in use by one or more PCBs. Look for best + * fit. 
*/ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; - CK_LIST_FOREACH(phd, porthash, phd_hash) { - if (phd->phd_port == lport) - break; - } - if (phd != NULL) { + CK_LIST_FOREACH(inp, porthash, inp_portlist) { + if (inp->inp_lport != lport) + continue; + if (!prison_equal_ip4(inp->inp_cred->cr_prison, + cred->cr_prison)) + continue; + if (fib != RT_ALL_FIBS && + inp->inp_inc.inc_fibnum != fib) + continue; + wildcard = 0; +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; /* - * Port is in use by one or more PCBs. Look for best - * fit. + * We never select the PCB that has INP_IPV6 flag and + * is bound to :: if we have another PCB which is bound + * to 0.0.0.0. If a PCB has the INP_IPV6 flag, then we + * set its cost higher than IPv4 only PCBs. + * + * Note that the case only happens when a socket is + * bound to ::, under the condition that the use of the + * mapped address is allowed. */ - CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { - wildcard = 0; - if (!prison_equal_ip4(inp->inp_cred->cr_prison, - cred->cr_prison)) - continue; - if (fib != RT_ALL_FIBS && - inp->inp_inc.inc_fibnum != fib) - continue; -#ifdef INET6 - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; - /* - * We never select the PCB that has - * INP_IPV6 flag and is bound to :: if - * we have another PCB which is bound - * to 0.0.0.0. If a PCB has the - * INP_IPV6 flag, then we set its cost - * higher than IPv4 only PCBs. - * - * Note that the case only happens - * when a socket is bound to ::, under - * the condition that the use of the - * mapped address is allowed. 
- */ - if ((inp->inp_vflag & INP_IPV6) != 0) - wildcard += INP_LOOKUP_MAPPED_PCB_COST; + if ((inp->inp_vflag & INP_IPV6) != 0) + wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif - if (inp->inp_faddr.s_addr != INADDR_ANY) + if (inp->inp_faddr.s_addr != INADDR_ANY) + wildcard++; + if (inp->inp_laddr.s_addr != INADDR_ANY) { + if (laddr.s_addr == INADDR_ANY) + wildcard++; + else if (inp->inp_laddr.s_addr != laddr.s_addr) + continue; + } else { + if (laddr.s_addr != INADDR_ANY) wildcard++; - if (inp->inp_laddr.s_addr != INADDR_ANY) { - if (laddr.s_addr == INADDR_ANY) - wildcard++; - else if (inp->inp_laddr.s_addr != laddr.s_addr) - continue; - } else { - if (laddr.s_addr != INADDR_ANY) - wildcard++; - } - if (wildcard < matchwild) { - match = inp; - matchwild = wildcard; - if (matchwild == 0) - break; - } + } + if (wildcard < matchwild) { + match = inp; + matchwild = wildcard; + if (matchwild == 0) + break; } } return (match); @@ -2642,10 +2623,8 @@ _in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) int in_pcbinshash(struct inpcb *inp) { - struct inpcbhead *pcbhash; - struct inpcbporthead *pcbporthash; + struct inpcbhead *pcbhash, *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; - struct inpcbport *phd; uint32_t hash; bool connected; @@ -2685,31 +2664,6 @@ in_pcbinshash(struct inpcb *inp) return (error); } - /* - * Go through port list and look for a head for this lport. - */ - CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { - if (phd->phd_port == inp->inp_lport) - break; - } - - /* - * If none exists, malloc one and tack it on. 
- */ - if (phd == NULL) { - phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); - if (phd == NULL) { - if ((inp->inp_flags & INP_INLBGROUP) != 0) - in_pcbremlbgrouphash(inp); - return (ENOMEM); - } - phd->phd_port = inp->inp_lport; - CK_LIST_INIT(&phd->phd_pcblist); - CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); - } - inp->inp_phd = phd; - CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); - /* * The PCB may have been disconnected in the past. Before we can safely * make it visible in the hash table, we must wait for all readers which @@ -2730,6 +2684,7 @@ in_pcbinshash(struct inpcb *inp) #endif _in_pcbinshash_wild(pcbhash, inp); } + CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist); inp->inp_flags |= INP_INHASHLIST; return (0); @@ -2738,7 +2693,6 @@ in_pcbinshash(struct inpcb *inp) void in_pcbremhash_locked(struct inpcb *inp) { - struct inpcbport *phd = inp->inp_phd; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); @@ -2761,10 +2715,6 @@ in_pcbremhash_locked(struct inpcb *inp) CK_LIST_REMOVE(inp, inp_hash_exact); } CK_LIST_REMOVE(inp, inp_portlist); - if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { - CK_LIST_REMOVE(phd, phd_hash); - uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); - } inp->inp_flags &= ~INP_INHASHLIST; } @@ -3275,8 +3225,7 @@ db_print_inpcb(struct inpcb *inp, const char *name, int indent) } db_print_indent(indent); - db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, - (uintmax_t)inp->inp_gencnt); + db_printf("inp_gencnt: %ju\n", (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index c2b90de2ef54..5fe12c4f1e76 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -64,7 +64,6 @@ * protocol-specific control block) are stored here. 
*/ CK_LIST_HEAD(inpcbhead, inpcb); -CK_LIST_HEAD(inpcbporthead, inpcbport); CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup); typedef uint64_t inp_gen_t; @@ -221,7 +220,6 @@ struct inpcb { short in6p_hops; }; CK_LIST_ENTRY(inpcb) inp_portlist; /* (r:e/w:h) port list */ - struct inpcbport *inp_phd; /* (r:e/w:h) head of this list */ inp_gen_t inp_gencnt; /* (c) generation count */ void *spare_ptr; /* Spare pointer. */ rt_gen_t inp_rt_cookie; /* generation for route entry */ @@ -370,7 +368,7 @@ struct inpcbinfo { /* * Global hash of inpcbs, hashed by only local port number. */ - struct inpcbporthead *ipi_porthashbase; /* (h) */ + struct inpcbhead *ipi_porthashbase; /* (h) */ u_long ipi_porthashmask; /* (h) */ /* @@ -392,11 +390,9 @@ struct inpcbinfo { */ struct inpcbstorage { uma_zone_t ips_zone; - uma_zone_t ips_portzone; uma_init ips_pcbinit; size_t ips_size; const char * ips_zone_name; - const char * ips_portzone_name; const char * ips_infolock_name; const char * ips_hashlock_name; }; @@ -414,7 +410,6 @@ static struct inpcbstorage prot = { \ .ips_size = sizeof(struct ppcb), \ .ips_pcbinit = prot##_inpcb_init, \ .ips_zone_name = zname, \ - .ips_portzone_name = zname " ports", \ .ips_infolock_name = iname, \ .ips_hashlock_name = hname, \ }; \ diff --git a/sys/netinet/in_pcb_var.h b/sys/netinet/in_pcb_var.h index fb88dfec889e..7e8a1626ab40 100644 --- a/sys/netinet/in_pcb_var.h +++ b/sys/netinet/in_pcb_var.h @@ -59,12 +59,6 @@ int in_pcbinshash(struct inpcb *); void in_pcbrehash(struct inpcb *); void in_pcbremhash_locked(struct inpcb *); -struct inpcbport { - struct inpcbhead phd_pcblist; - CK_LIST_ENTRY(inpcbport) phd_hash; - u_short phd_port; -}; - /* * Load balance groups used for the SO_REUSEPORT_LB socket option. 
Each group * (or unique address:port combination) can be re-used at most diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index 64c886ca2ed5..e77a1e9d3e87 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -767,56 +767,45 @@ in6_pcblookup_local(struct inpcbinfo *pcbinfo, const struct in6_addr *laddr, */ return (NULL); } else { - struct inpcbporthead *porthash; - struct inpcbport *phd; + struct inpcbhead *porthash; struct inpcb *match = NULL; + /* - * Best fit PCB lookup. - * - * First see if this local port is in use by looking on the - * port hash list. + * Port is in use by one or more PCBs. Look for best + * fit. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; - CK_LIST_FOREACH(phd, porthash, phd_hash) { - if (phd->phd_port == lport) - break; - } - if (phd != NULL) { - /* - * Port is in use by one or more PCBs. Look for best - * fit. - */ - CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { - wildcard = 0; - if (!prison_equal_ip6(cred->cr_prison, - inp->inp_cred->cr_prison)) - continue; - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV6) == 0) - continue; - if (fib != RT_ALL_FIBS && - inp->inp_inc.inc_fibnum != fib) + CK_LIST_FOREACH(inp, porthash, inp_portlist) { + if (inp->inp_lport != lport) + continue; + if (!prison_equal_ip6(cred->cr_prison, + inp->inp_cred->cr_prison)) + continue; + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV6) == 0) + continue; + if (fib != RT_ALL_FIBS && + inp->inp_inc.inc_fibnum != fib) + continue; + wildcard = 0; + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) + wildcard++; + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { + if (IN6_IS_ADDR_UNSPECIFIED(laddr)) + wildcard++; + else if (!IN6_ARE_ADDR_EQUAL( + &inp->in6p_laddr, laddr)) continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) + } else { + if (!IN6_IS_ADDR_UNSPECIFIED(laddr)) wildcard++; - if (!IN6_IS_ADDR_UNSPECIFIED( - &inp->in6p_laddr)) { - if 
(IN6_IS_ADDR_UNSPECIFIED(laddr)) - wildcard++; - else if (!IN6_ARE_ADDR_EQUAL( - &inp->in6p_laddr, laddr)) - continue; - } else { - if (!IN6_IS_ADDR_UNSPECIFIED(laddr)) - wildcard++; - } - if (wildcard < matchwild) { - match = inp; - matchwild = wildcard; - if (matchwild == 0) - break; - } + } + if (wildcard < matchwild) { + match = inp; + matchwild = wildcard; + if (matchwild == 0) + break; } } return (match);