The branch main has been updated by glebius:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=08d9c9202755a30f97617758595214a530afcaea

commit 08d9c9202755a30f97617758595214a530afcaea
Author:     Gleb Smirnoff <gleb...@freebsd.org>
AuthorDate: 2021-03-19 02:06:13 +0000
Commit:     Gleb Smirnoff <gleb...@freebsd.org>
CommitDate: 2021-04-12 15:25:31 +0000

    tcp_input/syncache: acquire only read lock on PCB for SYN,!ACK packets
    
    When packet is a SYN packet, we don't need to modify any existing PCB.
    Normally SYN arrives on a listening socket, we either create a syncache
    entry or generate syncookie, but we don't modify anything with the
    listening socket or associated PCB. Thus create a new PCB lookup
    mode - rlock if listening. This removes the primary contention point
    under SYN flood - the listening socket PCB.
    
    Sidenote: when SYN arrives on a synchronized connection, we still
    don't need write access to PCB to send a challenge ACK or just to
    drop. There is only one exclusion - tcptw recycling. However,
    existing entanglement of tcp_input + stacks doesn't allow to make
    this change small. Consider this patch as first approach to the problem.
    
    Reviewed by:    rrs
    Differential revision:  https://reviews.freebsd.org/D29576
---
 sys/netinet/in_pcb.c        | 35 ++++++++++++++---------------------
 sys/netinet/in_pcb.h        |  3 ++-
 sys/netinet/tcp_input.c     | 42 ++++++++++++++++++++++++------------------
 sys/netinet/tcp_subr.c      |  2 +-
 sys/netinet/tcp_syncache.c  | 10 +++++-----
 sys/netinet6/in6_pcb.c      | 34 ++++++++++++++--------------------
 sys/security/mac/mac_inet.c |  2 +-
 7 files changed, 61 insertions(+), 67 deletions(-)

diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 40a0b4c0676e..43cd469e0fcf 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -2518,31 +2518,24 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct 
in_addr faddr,
        struct inpcb *inp;
 
        inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
-           (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp,
-           numa_domain);
+           lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
        if (inp != NULL) {
                if (lookupflags & INPLOOKUP_WLOCKPCB) {
                        INP_WLOCK(inp);
-                       if (__predict_false(inp->inp_flags2 & INP_FREED)) {
-                               INP_WUNLOCK(inp);
-                               inp = NULL;
-                       }
                } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
                        INP_RLOCK(inp);
-                       if (__predict_false(inp->inp_flags2 & INP_FREED)) {
-                               INP_RUNLOCK(inp);
-                               inp = NULL;
-                       }
+               } else if (lookupflags & INPLOOKUP_RLOCKLISTEN) {
+                       if (inp->inp_socket != NULL &&
+                           SOLISTENING(inp->inp_socket))
+                               INP_RLOCK(inp);
+                       else
+                               INP_WLOCK(inp);
                } else
                        panic("%s: locking bug", __func__);
-#ifdef INVARIANTS
-               if (inp != NULL) {
-                       if (lookupflags & INPLOOKUP_WLOCKPCB)
-                               INP_WLOCK_ASSERT(inp);
-                       else
-                               INP_RLOCK_ASSERT(inp);
+               if (__predict_false(inp->inp_flags2 & INP_FREED)) {
+                       INP_UNLOCK(inp);
+                       inp = NULL;
                }
-#endif
        }
 
        return (inp);
@@ -2564,8 +2557,8 @@ in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr 
faddr, u_int fport,
 
        KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
            ("%s: invalid lookup flags %d", __func__, lookupflags));
-       KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
-           ("%s: LOCKPCB not set", __func__));
+       KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
+           INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__));
 
        /*
         * When not using RSS, use connection groups in preference to the
@@ -2600,8 +2593,8 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct 
in_addr faddr,
 
        KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
            ("%s: invalid lookup flags %d", __func__, lookupflags));
-       KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
-           ("%s: LOCKPCB not set", __func__));
+       KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
+           INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__));
 
 #ifdef PCBGROUP
        /*
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 9604a837cfb4..4959fc238dfb 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -761,9 +761,10 @@ int        inp_so_options(const struct inpcb *inp);
 #define        INPLOOKUP_WILDCARD      0x00000001      /* Allow wildcard 
sockets. */
 #define        INPLOOKUP_RLOCKPCB      0x00000002      /* Return inpcb 
read-locked. */
 #define        INPLOOKUP_WLOCKPCB      0x00000004      /* Return inpcb 
write-locked. */
+#define        INPLOOKUP_RLOCKLISTEN   0x00000008      /* Rlock if listening, 
W if !.*/
 
 #define        INPLOOKUP_MASK  (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
-                           INPLOOKUP_WLOCKPCB)
+           INPLOOKUP_WLOCKPCB | INPLOOKUP_RLOCKLISTEN)
 
 #define        sotoinpcb(so)   ((struct inpcb *)(so)->so_pcb)
 
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index e53296670a0f..f69262230a05 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -626,6 +626,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
        int drop_hdrlen;
        int thflags;
        int rstreason = 0;      /* For badport_bandlim accounting purposes */
+       int lookupflag;
        uint8_t iptos;
        struct m_tag *fwd_tag = NULL;
 #ifdef INET6
@@ -825,6 +826,12 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
            )
                fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
 
+       /*
+        * For initial SYN packets arriving on listening socket,
+        * we don't need write lock.
+        */
+       lookupflag = (thflags & (TH_ACK|TH_SYN)) == TH_SYN ?
+           INPLOOKUP_RLOCKLISTEN : INPLOOKUP_WLOCKPCB;
 findpcb:
 #ifdef INET6
        if (isipv6 && fwd_tag != NULL) {
@@ -837,7 +844,7 @@ findpcb:
                 */
                inp = in6_pcblookup_mbuf(&V_tcbinfo,
                    &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport,
-                   INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m);
+                   lookupflag, m->m_pkthdr.rcvif, m);
                if (!inp) {
                        /*
                         * It's new.  Try to find the ambushing socket.
@@ -847,14 +854,13 @@ findpcb:
                        inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src,
                            th->th_sport, &next_hop6->sin6_addr,
                            next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) :
-                           th->th_dport, INPLOOKUP_WILDCARD |
-                           INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
+                           th->th_dport, INPLOOKUP_WILDCARD | lookupflag,
+                           m->m_pkthdr.rcvif);
                }
        } else if (isipv6) {
                inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src,
                    th->th_sport, &ip6->ip6_dst, th->th_dport,
-                   INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
-                   m->m_pkthdr.rcvif, m);
+                   INPLOOKUP_WILDCARD | lookupflag, m->m_pkthdr.rcvif, m);
        }
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
@@ -870,8 +876,7 @@ findpcb:
                 * already got one like this?
                 */
                inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport,
-                   ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB,
-                   m->m_pkthdr.rcvif, m);
+                   ip->ip_dst, th->th_dport, lookupflag, m->m_pkthdr.rcvif, m);
                if (!inp) {
                        /*
                         * It's new.  Try to find the ambushing socket.
@@ -881,14 +886,13 @@ findpcb:
                        inp = in_pcblookup(&V_tcbinfo, ip->ip_src,
                            th->th_sport, next_hop->sin_addr,
                            next_hop->sin_port ? ntohs(next_hop->sin_port) :
-                           th->th_dport, INPLOOKUP_WILDCARD |
-                           INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif);
+                           th->th_dport, INPLOOKUP_WILDCARD | lookupflag,
+                           m->m_pkthdr.rcvif);
                }
        } else
                inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
                    th->th_sport, ip->ip_dst, th->th_dport,
-                   INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB,
-                   m->m_pkthdr.rcvif, m);
+                   INPLOOKUP_WILDCARD | lookupflag, m->m_pkthdr.rcvif, m);
 #endif /* INET */
 
        /*
@@ -918,14 +922,14 @@ findpcb:
                rstreason = BANDLIM_RST_CLOSEDPORT;
                goto dropwithreset;
        }
-       INP_WLOCK_ASSERT(inp);
+       INP_LOCK_ASSERT(inp);
        /*
         * While waiting for inp lock during the lookup, another thread
         * can have dropped the inpcb, in which case we need to loop back
         * and try to find a new inpcb to deliver to.
         */
        if (inp->inp_flags & INP_DROPPED) {
-               INP_WUNLOCK(inp);
+               INP_UNLOCK(inp);
                inp = NULL;
                goto findpcb;
        }
@@ -1014,7 +1018,6 @@ findpcb:
 #endif
 
 #ifdef MAC
-       INP_WLOCK_ASSERT(inp);
        if (mac_inpcb_check_deliver(inp, m))
                goto dropunlock;
 #endif
@@ -1124,8 +1127,10 @@ tfo_socket_result:
                         * Socket is created in state SYN_RECEIVED.
                         * Unlock the listen socket, lock the newly
                         * created socket and update the tp variable.
+                        * If we came here via jump to tfo_socket_result,
+                        * then listening socket is read-locked.
                         */
-                       INP_WUNLOCK(inp);       /* listen socket */
+                       INP_UNLOCK(inp);        /* listen socket */
                        inp = sotoinpcb(so);
                        /*
                         * New connection inpcb is already locked by
@@ -1213,6 +1218,7 @@ tfo_socket_result:
                    ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
                KASSERT(thflags & (TH_SYN),
                    ("%s: Listen socket: TH_SYN not set", __func__));
+               INP_RLOCK_ASSERT(inp);
 #ifdef INET6
                /*
                 * If deprecated address is forbidden,
@@ -1381,7 +1387,7 @@ dropwithreset:
 
        if (inp != NULL) {
                tcp_dropwithreset(m, th, tp, tlen, rstreason);
-               INP_WUNLOCK(inp);
+               INP_UNLOCK(inp);
        } else
                tcp_dropwithreset(m, th, NULL, tlen, rstreason);
        m = NULL;       /* mbuf chain got consumed. */
@@ -1392,7 +1398,7 @@ dropunlock:
                TCP_PROBE5(receive, NULL, tp, m, tp, th);
 
        if (inp != NULL)
-               INP_WUNLOCK(inp);
+               INP_UNLOCK(inp);
 
 drop:
        INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
@@ -3360,7 +3366,7 @@ tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, 
struct tcpcb *tp,
 #endif
 
        if (tp != NULL) {
-               INP_WLOCK_ASSERT(tp->t_inpcb);
+               INP_LOCK_ASSERT(tp->t_inpcb);
        }
 
        /* Don't bother if destination was broadcast/multicast. */
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 4ed7e16f3557..05af0076b23a 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1428,7 +1428,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr 
*th, struct mbuf *m,
        if (tp != NULL) {
                inp = tp->t_inpcb;
                KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
-               INP_WLOCK_ASSERT(inp);
+               INP_LOCK_ASSERT(inp);
        } else
                inp = NULL;
 
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 771ff44b8924..b1a0c1f7e229 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -1397,7 +1397,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, 
struct tcphdr *th,
        int tfo_response_cookie_valid = 0;
        bool locked;
 
-       INP_WLOCK_ASSERT(inp);                  /* listen socket */
+       INP_RLOCK_ASSERT(inp);                  /* listen socket */
        KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN,
            ("%s: unexpected tcp flags", __func__));
 
@@ -1469,13 +1469,13 @@ syncache_add(struct in_conninfo *inc, struct tcpopt 
*to, struct tcphdr *th,
 
 #ifdef MAC
        if (mac_syncache_init(&maclabel) != 0) {
-               INP_WUNLOCK(inp);
+               INP_RUNLOCK(inp);
                goto done;
        } else
                mac_syncache_create(maclabel, inp);
 #endif
        if (!tfo_cookie_valid)
-               INP_WUNLOCK(inp);
+               INP_RUNLOCK(inp);
 
        /*
         * Remember the IP options, if any.
@@ -1528,7 +1528,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, 
struct tcphdr *th,
        }
        if (sc != NULL) {
                if (tfo_cookie_valid)
-                       INP_WUNLOCK(inp);
+                       INP_RUNLOCK(inp);
                TCPSTAT_INC(tcps_sc_dupsyn);
                if (ipopts) {
                        /*
@@ -1735,7 +1735,7 @@ skip_alloc:
 
        if (tfo_cookie_valid) {
                syncache_tfo_expand(sc, lsop, m, tfo_response_cookie);
-               /* INP_WUNLOCK(inp) will be performed by the caller */
+               /* INP_RUNLOCK(inp) will be performed by the caller */
                rv = 1;
                goto tfo_expanded;
        }
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 5fce9fcafa33..96be795d5757 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -1293,31 +1293,24 @@ in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct 
in6_addr *faddr,
        struct inpcb *inp;
 
        inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
-           (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp,
-           numa_domain);
+           lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
        if (inp != NULL) {
                if (lookupflags & INPLOOKUP_WLOCKPCB) {
                        INP_WLOCK(inp);
-                       if (__predict_false(inp->inp_flags2 & INP_FREED)) {
-                               INP_WUNLOCK(inp);
-                               inp = NULL;
-                       }
                } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
                        INP_RLOCK(inp);
-                       if (__predict_false(inp->inp_flags2 & INP_FREED)) {
-                               INP_RUNLOCK(inp);
-                               inp = NULL;
-                       }
+               } else if (lookupflags & INPLOOKUP_RLOCKLISTEN) {
+                       if (inp->inp_socket != NULL &&
+                           SOLISTENING(inp->inp_socket))
+                               INP_RLOCK(inp);
+                       else
+                               INP_WLOCK(inp);
                } else
                        panic("%s: locking bug", __func__);
-#ifdef INVARIANTS
-               if (inp != NULL) {
-                       if (lookupflags & INPLOOKUP_WLOCKPCB)
-                               INP_WLOCK_ASSERT(inp);
-                       else
-                               INP_RLOCK_ASSERT(inp);
+               if (__predict_false(inp->inp_flags2 & INP_FREED)) {
+                       INP_UNLOCK(inp);
+                       inp = NULL;
                }
-#endif
        }
        return (inp);
 }
@@ -1338,8 +1331,8 @@ in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr 
*faddr, u_int fport,
 
        KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
            ("%s: invalid lookup flags %d", __func__, lookupflags));
-       KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
-           ("%s: LOCKPCB not set", __func__));
+       KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
+           INPLOOKUP_RLOCKLISTEN)) != 0, ("%s: LOCKPCB not set", __func__));
 
        /*
         * When not using RSS, use connection groups in preference to the
@@ -1374,7 +1367,8 @@ in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct 
in6_addr *faddr,
 
        KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
            ("%s: invalid lookup flags %d", __func__, lookupflags));
-       KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
+       KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB |
+           INPLOOKUP_RLOCKLISTEN)) != 0,
            ("%s: LOCKPCB not set", __func__));
 
 #ifdef PCBGROUP
diff --git a/sys/security/mac/mac_inet.c b/sys/security/mac/mac_inet.c
index 97f256c118fc..2b6a70fdf1bf 100644
--- a/sys/security/mac/mac_inet.c
+++ b/sys/security/mac/mac_inet.c
@@ -487,7 +487,7 @@ void
 mac_syncache_create(struct label *label, struct inpcb *inp)
 {
 
-       INP_WLOCK_ASSERT(inp);
+       INP_LOCK_ASSERT(inp);
 
        MAC_POLICY_PERFORM_NOSLEEP(syncache_create, label, inp);
 }
_______________________________________________
dev-commits-src-main@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/dev-commits-src-main
To unsubscribe, send any mail to "dev-commits-src-main-unsubscr...@freebsd.org"

Reply via email to