On 08/12/21(Wed) 22:39, Alexander Bluhm wrote:
> On Wed, Dec 08, 2021 at 03:28:34PM -0300, Martin Pieuchot wrote:
> > On 04/12/21(Sat) 01:02, Alexander Bluhm wrote:
> > > Hi,
> > > 
> > > As I want a read-only net lock for forwarding, all paths should be
> > > checked for the correct net lock and asserts.  I found code in
> > > in_pcbhashlookup() where reading the PCB table results in a write
> > > to optimize the cache.
> > > 
> > > Porperly protecting PCB hashes is out of scope for parallel forwarding.
> > > Can we get away with this hack?  Only update the cache when we are
> > > in TCP oder UDP stack with the write lock.  The access from pf is
> > > read-only.
> > > 
> > > NET_WLOCKED() indicates whether we may change data structures.
> > 
> > I recall that we currently do not want to introduce such idiom: change
> > the behavior of the code depending on which lock is held by the caller.
> > 
> > Can we instead assert that a write-lock is held before modifying the
> > list?
> 
> We could also pass down the kind of lock that is used.  Goal is
> that pf uses shared net lock.  TCP and UDP will keep the exclusice
> net lock for a while.

Changing the logic of a function based on the type of a lock is not
different from the previous approach.

> Diff gets longer but perhaps a bit clearer what is going on.

I believe we want to split in_pcblookup_listen() into two functions.
One which is read-only and one which modifies the head of the hash
chain.
The read-only asserts for any lock and the one that modifies the hash
calls the former and assert for a write lock.

Alternatively, we could protect the PCB hash with a mutex.  This has the
advantage of not making the scope of the NET_LOCK() more complex.  In
the end we all know something like that will be done.  I don't know how
other BSD did this but I'm sure this will help getting the remaining
socket layer out of the NET_LOCK().

> Index: net/pf.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf.c,v
> retrieving revision 1.1122
> diff -u -p -r1.1122 pf.c
> --- net/pf.c  7 Jul 2021 18:38:25 -0000       1.1122
> +++ net/pf.c  8 Dec 2021 21:16:16 -0000
> @@ -3317,14 +3317,12 @@ pf_socket_lookup(struct pf_pdesc *pd)
>               sport = pd->hdr.tcp.th_sport;
>               dport = pd->hdr.tcp.th_dport;
>               PF_ASSERT_LOCKED();
> -             NET_ASSERT_LOCKED();
>               tb = &tcbtable;
>               break;
>       case IPPROTO_UDP:
>               sport = pd->hdr.udp.uh_sport;
>               dport = pd->hdr.udp.uh_dport;
>               PF_ASSERT_LOCKED();
> -             NET_ASSERT_LOCKED();
>               tb = &udbtable;
>               break;
>       default:
> @@ -3348,22 +3346,24 @@ pf_socket_lookup(struct pf_pdesc *pd)
>                * Fails when rtable is changed while evaluating the ruleset
>                * The socket looked up will not match the one hit in the end.
>                */
> -             inp = in_pcbhashlookup(tb, saddr->v4, sport, daddr->v4, dport,
> -                 pd->rdomain);
> +             NET_ASSERT_LOCKED();
> +             inp = in_pcbhashlookup_wlocked(tb, saddr->v4, sport, daddr->v4,
> +                 dport, pd->rdomain, 0);
>               if (inp == NULL) {
> -                     inp = in_pcblookup_listen(tb, daddr->v4, dport,
> -                         NULL, pd->rdomain);
> +                     inp = in_pcblookup_listen_wlocked(tb, daddr->v4, dport,
> +                         NULL, pd->rdomain, 0);
>                       if (inp == NULL)
>                               return (-1);
>               }
>               break;
>  #ifdef INET6
>       case AF_INET6:
> -             inp = in6_pcbhashlookup(tb, &saddr->v6, sport, &daddr->v6,
> -                 dport, pd->rdomain);
> +             NET_ASSERT_LOCKED();
> +             inp = in6_pcbhashlookup_wlocked(tb, &saddr->v6, sport,
> +                 &daddr->v6, dport, pd->rdomain, 0);
>               if (inp == NULL) {
> -                     inp = in6_pcblookup_listen(tb, &daddr->v6, dport,
> -                         NULL, pd->rdomain);
> +                     inp = in6_pcblookup_listen_wlocked(tb, &daddr->v6,
> +                         dport, NULL, pd->rdomain, 0);
>                       if (inp == NULL)
>                               return (-1);
>               }
> Index: netinet/in_pcb.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.c,v
> retrieving revision 1.256
> diff -u -p -r1.256 in_pcb.c
> --- netinet/in_pcb.c  25 Oct 2021 22:20:47 -0000      1.256
> +++ netinet/in_pcb.c  8 Dec 2021 21:16:16 -0000
> @@ -1051,6 +1051,15 @@ in_pcbresize(struct inpcbtable *table, i
>  int  in_pcbnotifymiss = 0;
>  #endif
>  
> +struct inpcb *
> +in_pcbhashlookup(struct inpcbtable *table, struct in_addr faddr,
> +    u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_int rtable)
> +{
> +     NET_ASSERT_WLOCKED();
> +     return in_pcbhashlookup_wlocked(table, faddr ,fport_arg, laddr,
> +         lport_arg, rtable, 1);
> +}
> +
>  /*
>   * The in(6)_pcbhashlookup functions are used to locate connected sockets
>   * quickly:
> @@ -1061,8 +1070,9 @@ int     in_pcbnotifymiss = 0;
>   * After those two lookups no other are necessary.
>   */
>  struct inpcb *
> -in_pcbhashlookup(struct inpcbtable *table, struct in_addr faddr,
> -    u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_int rtable)
> +in_pcbhashlookup_wlocked(struct inpcbtable *table, struct in_addr faddr,
> +    u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_int rtable,
> +    int wlocked)
>  {
>       struct inpcbhead *head;
>       struct inpcb *inp;
> @@ -1093,7 +1103,7 @@ in_pcbhashlookup(struct inpcbtable *tabl
>               }
>       }
>  #ifdef DIAGNOSTIC
> -     if (inp == NULL && in_pcbnotifymiss) {
> +     if (wlocked && inp == NULL && in_pcbnotifymiss) {
>               printf("%s: faddr=%08x fport=%d laddr=%08x lport=%d rdom=%u\n",
>                   __func__, ntohl(faddr.s_addr), ntohs(fport),
>                   ntohl(laddr.s_addr), ntohs(lport), rdomain);
> @@ -1102,6 +1112,15 @@ in_pcbhashlookup(struct inpcbtable *tabl
>       return (inp);
>  }
>  
> +struct inpcb *
> +in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr,
> +    u_int lport_arg, struct mbuf *m, u_int rtable)
> +{
> +     NET_ASSERT_WLOCKED();
> +     return in_pcblookup_listen_wlocked(table, laddr, lport_arg, m, rtable,
> +         1);
> +}
> +
>  /*
>   * The in(6)_pcblookup_listen functions are used to locate listening
>   * sockets quickly.  This are sockets with unspecified foreign address
> @@ -1110,8 +1129,8 @@ in_pcbhashlookup(struct inpcbtable *tabl
>   *           *.*     <->     *.lport
>   */
>  struct inpcb *
> -in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr,
> -    u_int lport_arg, struct mbuf *m, u_int rtable)
> +in_pcblookup_listen_wlocked(struct inpcbtable *table, struct in_addr laddr,
> +    u_int lport_arg, struct mbuf *m, u_int rtable, int wlocked)
>  {
>       struct inpcbhead *head;
>       const struct in_addr *key1, *key2;
> @@ -1185,7 +1204,7 @@ in_pcblookup_listen(struct inpcbtable *t
>        * repeated accesses are quicker.  This is analogous to
>        * the historic single-entry PCB cache.
>        */
> -     if (inp != NULL && inp != LIST_FIRST(head)) {
> +     if (wlocked && inp != NULL && inp != LIST_FIRST(head)) {
>               LIST_REMOVE(inp, inp_hash);
>               LIST_INSERT_HEAD(head, inp, inp_hash);
>       }
> Index: netinet/in_pcb.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/in_pcb.h,v
> retrieving revision 1.121
> diff -u -p -r1.121 in_pcb.h
> --- netinet/in_pcb.h  25 Jan 2021 03:40:46 -0000      1.121
> +++ netinet/in_pcb.h  8 Dec 2021 21:16:16 -0000
> @@ -274,20 +274,32 @@ void     in_pcbunref(struct inpcb *);
>  void  in_pcbdisconnect(struct inpcb *);
>  struct inpcb *
>        in_pcbhashlookup(struct inpcbtable *, struct in_addr,
> -                            u_int, struct in_addr, u_int, u_int);
> +         u_int, struct in_addr, u_int, u_int);
>  struct inpcb *
>        in_pcblookup_listen(struct inpcbtable *, struct in_addr, u_int,
>           struct mbuf *, u_int);
> +struct inpcb *
> +      in_pcbhashlookup_wlocked(struct inpcbtable *, struct in_addr,
> +        u_int, struct in_addr, u_int, u_int, int);
> +struct inpcb *
> +      in_pcblookup_listen_wlocked(struct inpcbtable *, struct in_addr, u_int,
> +         struct mbuf *, u_int, int);
>  #ifdef INET6
>  struct inpcbhead *
>        in6_pcbhash(struct inpcbtable *, int, const struct in6_addr *,
>           u_short, const struct in6_addr *, u_short);
>  struct inpcb *
>        in6_pcbhashlookup(struct inpcbtable *, const struct in6_addr *,
> -                            u_int, const struct in6_addr *, u_int, u_int);
> +         u_int, const struct in6_addr *, u_int, u_int);
>  struct inpcb *
>        in6_pcblookup_listen(struct inpcbtable *, struct in6_addr *, u_int,
>           struct mbuf *, u_int);
> +struct inpcb *
> +      in6_pcbhashlookup_wlocked(struct inpcbtable *, const struct in6_addr *,
> +         u_int, const struct in6_addr *, u_int, u_int, int);
> +struct inpcb *
> +      in6_pcblookup_listen_wlocked(struct inpcbtable *, struct in6_addr *,
> +         u_int, struct mbuf *, u_int, int);
>  int   in6_pcbaddrisavail(struct inpcb *, struct sockaddr_in6 *, int,
>           struct proc *);
>  int   in6_pcbconnect(struct inpcb *, struct mbuf *);
> Index: netinet6/in6_pcb.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/in6_pcb.c,v
> retrieving revision 1.112
> diff -u -p -r1.112 in6_pcb.c
> --- netinet6/in6_pcb.c        11 Feb 2021 10:41:19 -0000      1.112
> +++ netinet6/in6_pcb.c        8 Dec 2021 21:16:16 -0000
> @@ -500,6 +500,16 @@ in6_pcbhashlookup(struct inpcbtable *tab
>      u_int fport_arg, const struct in6_addr *laddr, u_int lport_arg,
>      u_int rtable)
>  {
> +     NET_ASSERT_WLOCKED();
> +     return in6_pcbhashlookup_wlocked(table, faddr, fport_arg,
> +         laddr, lport_arg, rtable, 1);
> +}
> +
> +struct inpcb *
> +in6_pcbhashlookup_wlocked(struct inpcbtable *table,
> +    const struct in6_addr *faddr, u_int fport_arg,
> +    const struct in6_addr *laddr, u_int lport_arg, u_int rtable, int wlocked)
> +{
>       struct inpcbhead *head;
>       struct inpcb *inp;
>       u_int16_t fport = fport_arg, lport = lport_arg;
> @@ -539,6 +549,14 @@ struct inpcb *
>  in6_pcblookup_listen(struct inpcbtable *table, struct in6_addr *laddr,
>      u_int lport_arg, struct mbuf *m, u_int rtable)
>  {
> +     return in6_pcblookup_listen_wlocked(table, laddr, lport_arg, m, rtable,
> +         1);
> +}
> +
> +struct inpcb *
> +in6_pcblookup_listen_wlocked(struct inpcbtable *table, struct in6_addr 
> *laddr,
> +    u_int lport_arg, struct mbuf *m, u_int rtable, int wlocked)
> +{
>       struct inpcbhead *head;
>       const struct in6_addr *key1, *key2;
>       struct inpcb *inp;
> @@ -604,7 +622,7 @@ in6_pcblookup_listen(struct inpcbtable *
>        * repeated accesses are quicker.  This is analogous to
>        * the historic single-entry PCB cache.
>        */
> -     if (inp != NULL && inp != LIST_FIRST(head)) {
> +     if (wlocked && inp != NULL && inp != LIST_FIRST(head)) {
>               LIST_REMOVE(inp, inp_hash);
>               LIST_INSERT_HEAD(head, inp, inp_hash);
>       }
> 

Reply via email to