On Wed, Dec 08, 2021 at 03:28:34PM -0300, Martin Pieuchot wrote:
> On 04/12/21(Sat) 01:02, Alexander Bluhm wrote:
> > Hi,
> > 
> > As I want a read-only net lock for forwarding, all paths should be
> > checked for the correct net lock and asserts.  I found code in
> > in_pcbhashlookup() where reading the PCB table results in a write
> > to optimize the cache.
> > 
> > Porperly protecting PCB hashes is out of scope for parallel forwarding.
> > Can we get away with this hack?  Only update the cache when we are
> > in TCP oder UDP stack with the write lock.  The access from pf is
> > read-only.
> > 
> > NET_WLOCKED() indicates whether we may change data structures.
> 
> I recall that we currently do not want to introduce such idiom: change
> the behavior of the code depending on which lock is held by the caller.
> 
> Can we instead assert that a write-lock is held before modifying the
> list?

We could also pass down the kind of lock that is used.  Goal is
that pf uses shared net lock.  TCP and UDP will keep the exclusice
net lock for a while.

Diff gets longer but perhaps a bit clearer what is going on.

bluhm

Index: net/pf.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
retrieving revision 1.1122
diff -u -p -r1.1122 pf.c
--- net/pf.c    7 Jul 2021 18:38:25 -0000       1.1122
+++ net/pf.c    8 Dec 2021 21:16:16 -0000
@@ -3317,14 +3317,12 @@ pf_socket_lookup(struct pf_pdesc *pd)
                sport = pd->hdr.tcp.th_sport;
                dport = pd->hdr.tcp.th_dport;
                PF_ASSERT_LOCKED();
-               NET_ASSERT_LOCKED();
                tb = &tcbtable;
                break;
        case IPPROTO_UDP:
                sport = pd->hdr.udp.uh_sport;
                dport = pd->hdr.udp.uh_dport;
                PF_ASSERT_LOCKED();
-               NET_ASSERT_LOCKED();
                tb = &udbtable;
                break;
        default:
@@ -3348,22 +3346,24 @@ pf_socket_lookup(struct pf_pdesc *pd)
                 * Fails when rtable is changed while evaluating the ruleset
                 * The socket looked up will not match the one hit in the end.
                 */
-               inp = in_pcbhashlookup(tb, saddr->v4, sport, daddr->v4, dport,
-                   pd->rdomain);
+               NET_ASSERT_LOCKED();
+               inp = in_pcbhashlookup_wlocked(tb, saddr->v4, sport, daddr->v4,
+                   dport, pd->rdomain, 0);
                if (inp == NULL) {
-                       inp = in_pcblookup_listen(tb, daddr->v4, dport,
-                           NULL, pd->rdomain);
+                       inp = in_pcblookup_listen_wlocked(tb, daddr->v4, dport,
+                           NULL, pd->rdomain, 0);
                        if (inp == NULL)
                                return (-1);
                }
                break;
 #ifdef INET6
        case AF_INET6:
-               inp = in6_pcbhashlookup(tb, &saddr->v6, sport, &daddr->v6,
-                   dport, pd->rdomain);
+               NET_ASSERT_LOCKED();
+               inp = in6_pcbhashlookup_wlocked(tb, &saddr->v6, sport,
+                   &daddr->v6, dport, pd->rdomain, 0);
                if (inp == NULL) {
-                       inp = in6_pcblookup_listen(tb, &daddr->v6, dport,
-                           NULL, pd->rdomain);
+                       inp = in6_pcblookup_listen_wlocked(tb, &daddr->v6,
+                           dport, NULL, pd->rdomain, 0);
                        if (inp == NULL)
                                return (-1);
                }
Index: netinet/in_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
retrieving revision 1.256
diff -u -p -r1.256 in_pcb.c
--- netinet/in_pcb.c    25 Oct 2021 22:20:47 -0000      1.256
+++ netinet/in_pcb.c    8 Dec 2021 21:16:16 -0000
@@ -1051,6 +1051,15 @@ in_pcbresize(struct inpcbtable *table, i
 int    in_pcbnotifymiss = 0;
 #endif
 
+struct inpcb *
+in_pcbhashlookup(struct inpcbtable *table, struct in_addr faddr,
+    u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_int rtable)
+{
+       NET_ASSERT_WLOCKED();
+       return in_pcbhashlookup_wlocked(table, faddr ,fport_arg, laddr,
+           lport_arg, rtable, 1);
+}
+
 /*
  * The in(6)_pcbhashlookup functions are used to locate connected sockets
  * quickly:
@@ -1061,8 +1070,9 @@ int       in_pcbnotifymiss = 0;
  * After those two lookups no other are necessary.
  */
 struct inpcb *
-in_pcbhashlookup(struct inpcbtable *table, struct in_addr faddr,
-    u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_int rtable)
+in_pcbhashlookup_wlocked(struct inpcbtable *table, struct in_addr faddr,
+    u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_int rtable,
+    int wlocked)
 {
        struct inpcbhead *head;
        struct inpcb *inp;
@@ -1093,7 +1103,7 @@ in_pcbhashlookup(struct inpcbtable *tabl
                }
        }
 #ifdef DIAGNOSTIC
-       if (inp == NULL && in_pcbnotifymiss) {
+       if (wlocked && inp == NULL && in_pcbnotifymiss) {
                printf("%s: faddr=%08x fport=%d laddr=%08x lport=%d rdom=%u\n",
                    __func__, ntohl(faddr.s_addr), ntohs(fport),
                    ntohl(laddr.s_addr), ntohs(lport), rdomain);
@@ -1102,6 +1112,15 @@ in_pcbhashlookup(struct inpcbtable *tabl
        return (inp);
 }
 
+struct inpcb *
+in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr,
+    u_int lport_arg, struct mbuf *m, u_int rtable)
+{
+       NET_ASSERT_WLOCKED();
+       return in_pcblookup_listen_wlocked(table, laddr, lport_arg, m, rtable,
+           1);
+}
+
 /*
  * The in(6)_pcblookup_listen functions are used to locate listening
  * sockets quickly.  This are sockets with unspecified foreign address
@@ -1110,8 +1129,8 @@ in_pcbhashlookup(struct inpcbtable *tabl
  *             *.*     <->     *.lport
  */
 struct inpcb *
-in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr,
-    u_int lport_arg, struct mbuf *m, u_int rtable)
+in_pcblookup_listen_wlocked(struct inpcbtable *table, struct in_addr laddr,
+    u_int lport_arg, struct mbuf *m, u_int rtable, int wlocked)
 {
        struct inpcbhead *head;
        const struct in_addr *key1, *key2;
@@ -1185,7 +1204,7 @@ in_pcblookup_listen(struct inpcbtable *t
         * repeated accesses are quicker.  This is analogous to
         * the historic single-entry PCB cache.
         */
-       if (inp != NULL && inp != LIST_FIRST(head)) {
+       if (wlocked && inp != NULL && inp != LIST_FIRST(head)) {
                LIST_REMOVE(inp, inp_hash);
                LIST_INSERT_HEAD(head, inp, inp_hash);
        }
Index: netinet/in_pcb.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
retrieving revision 1.121
diff -u -p -r1.121 in_pcb.h
--- netinet/in_pcb.h    25 Jan 2021 03:40:46 -0000      1.121
+++ netinet/in_pcb.h    8 Dec 2021 21:16:16 -0000
@@ -274,20 +274,32 @@ void       in_pcbunref(struct inpcb *);
 void    in_pcbdisconnect(struct inpcb *);
 struct inpcb *
         in_pcbhashlookup(struct inpcbtable *, struct in_addr,
-                              u_int, struct in_addr, u_int, u_int);
+           u_int, struct in_addr, u_int, u_int);
 struct inpcb *
         in_pcblookup_listen(struct inpcbtable *, struct in_addr, u_int,
            struct mbuf *, u_int);
+struct inpcb *
+        in_pcbhashlookup_wlocked(struct inpcbtable *, struct in_addr,
+          u_int, struct in_addr, u_int, u_int, int);
+struct inpcb *
+        in_pcblookup_listen_wlocked(struct inpcbtable *, struct in_addr, u_int,
+           struct mbuf *, u_int, int);
 #ifdef INET6
 struct inpcbhead *
         in6_pcbhash(struct inpcbtable *, int, const struct in6_addr *,
            u_short, const struct in6_addr *, u_short);
 struct inpcb *
         in6_pcbhashlookup(struct inpcbtable *, const struct in6_addr *,
-                              u_int, const struct in6_addr *, u_int, u_int);
+           u_int, const struct in6_addr *, u_int, u_int);
 struct inpcb *
         in6_pcblookup_listen(struct inpcbtable *, struct in6_addr *, u_int,
            struct mbuf *, u_int);
+struct inpcb *
+        in6_pcbhashlookup_wlocked(struct inpcbtable *, const struct in6_addr *,
+           u_int, const struct in6_addr *, u_int, u_int, int);
+struct inpcb *
+        in6_pcblookup_listen_wlocked(struct inpcbtable *, struct in6_addr *,
+           u_int, struct mbuf *, u_int, int);
 int     in6_pcbaddrisavail(struct inpcb *, struct sockaddr_in6 *, int,
            struct proc *);
 int     in6_pcbconnect(struct inpcb *, struct mbuf *);
Index: netinet6/in6_pcb.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_pcb.c,v
retrieving revision 1.112
diff -u -p -r1.112 in6_pcb.c
--- netinet6/in6_pcb.c  11 Feb 2021 10:41:19 -0000      1.112
+++ netinet6/in6_pcb.c  8 Dec 2021 21:16:16 -0000
@@ -500,6 +500,16 @@ in6_pcbhashlookup(struct inpcbtable *tab
     u_int fport_arg, const struct in6_addr *laddr, u_int lport_arg,
     u_int rtable)
 {
+       NET_ASSERT_WLOCKED();
+       return in6_pcbhashlookup_wlocked(table, faddr, fport_arg,
+           laddr, lport_arg, rtable, 1);
+}
+
+struct inpcb *
+in6_pcbhashlookup_wlocked(struct inpcbtable *table,
+    const struct in6_addr *faddr, u_int fport_arg,
+    const struct in6_addr *laddr, u_int lport_arg, u_int rtable, int wlocked)
+{
        struct inpcbhead *head;
        struct inpcb *inp;
        u_int16_t fport = fport_arg, lport = lport_arg;
@@ -539,6 +549,14 @@ struct inpcb *
 in6_pcblookup_listen(struct inpcbtable *table, struct in6_addr *laddr,
     u_int lport_arg, struct mbuf *m, u_int rtable)
 {
+       return in6_pcblookup_listen_wlocked(table, laddr, lport_arg, m, rtable,
+           1);
+}
+
+struct inpcb *
+in6_pcblookup_listen_wlocked(struct inpcbtable *table, struct in6_addr *laddr,
+    u_int lport_arg, struct mbuf *m, u_int rtable, int wlocked)
+{
        struct inpcbhead *head;
        const struct in6_addr *key1, *key2;
        struct inpcb *inp;
@@ -604,7 +622,7 @@ in6_pcblookup_listen(struct inpcbtable *
         * repeated accesses are quicker.  This is analogous to
         * the historic single-entry PCB cache.
         */
-       if (inp != NULL && inp != LIST_FIRST(head)) {
+       if (wlocked && inp != NULL && inp != LIST_FIRST(head)) {
                LIST_REMOVE(inp, inp_hash);
                LIST_INSERT_HEAD(head, inp, inp_hash);
        }

Reply via email to