On Fri, Dec 03, 2021 at 08:35:45PM +0100, Alexander Bluhm wrote:
> Hi,
>
> After implementing MP safe refcounting for IPsec TDB, I wonder how
> stable my diff for forwarding on multiple CPU is.
>
> Note that IPsec still has the workaround to disable multiple queues.
> We do not have a mutex that protects the TDB fields yet. We have
> only fixed the memory management.
>
> My goal is to get real MP pressure on the lower part of the IP
> network stack. Only this will show remaining bugs.
>
> bluhm
>
This unlocked `ipsec_in_use' check doesn't work.
SADB_X_ADDFLOW and `ipsec_in_use' increment are netlock serialized.
> if_input_process(struct ifnet *ifp, struct mbuf_list *ml)
> {
> struct mbuf *m;
> + int exclusive_lock = 0;
>
> if (ml_empty(ml))
> return;
> @@ -834,15 +839,27 @@ if_input_process(struct ifnet *ifp, stru
> * lists and the socket layer.
> */
>
> +#ifdef IPSEC
> /*
> * XXXSMP IPsec data structures are not ready to be accessed
> * by multiple network threads in parallel. In this case
> * use an exclusive lock.
> */
> - NET_LOCK();
> + if (ipsec_in_use)
> + exclusive_lock = 1;
> +#endif
Concurrent SADB_X_ADDFLOW threads could add some SA between your
unlocked `ipsec_in_use' check and the following shared netlock. This
time `ipsec_in_use' modification is netlock serialized but you could add
the new rwlock(9). Since we are only interesting in non-null value of
`ipsec_in_use' it's enough to add it here and around SADB_X_ADDFLOW case
within pfkeyv2_send().
I already posted the fix for this diff to your old [1] thread.
> + if (exclusive_lock)
> + NET_LOCK();
> + else
> + NET_RLOCK_IN_SOFTNET();
> +
> while ((m = ml_dequeue(ml)) != NULL)
> (*ifp->if_input)(ifp, m);
1. https://marc.info/?l=openbsd-tech&m=162698721127298&w=2
> Index: net/if.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if.c,v
> retrieving revision 1.644
> diff -u -p -r1.644 if.c
> --- net/if.c 11 Nov 2021 10:03:10 -0000 1.644
> +++ net/if.c 3 Dec 2021 18:51:29 -0000
> @@ -108,6 +108,10 @@
> #include <netinet6/ip6_var.h>
> #endif
>
> +#ifdef IPSEC
> +#include <netinet/ip_ipsp.h>
> +#endif
> +
> #ifdef MPLS
> #include <netmpls/mpls.h>
> #endif
> @@ -237,7 +241,7 @@ int ifq_congestion;
>
> int netisr;
>
> -#define NET_TASKQ 1
> +#define NET_TASKQ 4
> struct taskq *nettqmp[NET_TASKQ];
>
> struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
> @@ -814,6 +818,7 @@ void
> if_input_process(struct ifnet *ifp, struct mbuf_list *ml)
> {
> struct mbuf *m;
> + int exclusive_lock = 0;
>
> if (ml_empty(ml))
> return;
> @@ -834,15 +839,27 @@ if_input_process(struct ifnet *ifp, stru
> * lists and the socket layer.
> */
>
> +#ifdef IPSEC
> /*
> * XXXSMP IPsec data structures are not ready to be accessed
> * by multiple network threads in parallel. In this case
> * use an exclusive lock.
> */
> - NET_LOCK();
> + if (ipsec_in_use)
> + exclusive_lock = 1;
> +#endif
> + if (exclusive_lock)
> + NET_LOCK();
> + else
> + NET_RLOCK_IN_SOFTNET();
> +
> while ((m = ml_dequeue(ml)) != NULL)
> (*ifp->if_input)(ifp, m);
> - NET_UNLOCK();
> +
> + if (exclusive_lock)
> + NET_UNLOCK();
> + else
> + NET_RUNLOCK_IN_SOFTNET();
> }
>
> void
> @@ -899,6 +916,12 @@ if_netisr(void *unused)
> arpintr();
> KERNEL_UNLOCK();
> }
> +#endif
> + if (n & (1 << NETISR_IP))
> + ipintr();
> +#ifdef INET6
> + if (n & (1 << NETISR_IPV6))
> + ip6intr();
> #endif
> #if NPPP > 0
> if (n & (1 << NETISR_PPP)) {
> Index: net/if_ethersubr.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_ethersubr.c,v
> retrieving revision 1.276
> diff -u -p -r1.276 if_ethersubr.c
> --- net/if_ethersubr.c 19 Aug 2021 10:22:00 -0000 1.276
> +++ net/if_ethersubr.c 3 Dec 2021 18:51:29 -0000
> @@ -222,7 +222,10 @@ ether_resolve(struct ifnet *ifp, struct
>
> switch (af) {
> case AF_INET:
> + KERNEL_LOCK();
> + /* XXXSMP there is a MP race in arpresolve() */
> error = arpresolve(ifp, rt, m, dst, eh->ether_dhost);
> + KERNEL_UNLOCK();
> if (error)
> return (error);
> eh->ether_type = htons(ETHERTYPE_IP);
> @@ -245,7 +248,10 @@ ether_resolve(struct ifnet *ifp, struct
> break;
> #ifdef INET6
> case AF_INET6:
> + KERNEL_LOCK();
> + /* XXXSMP there is a MP race in nd6_resolve() */
> error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost);
> + KERNEL_UNLOCK();
> if (error)
> return (error);
> eh->ether_type = htons(ETHERTYPE_IPV6);
> @@ -271,13 +277,19 @@ ether_resolve(struct ifnet *ifp, struct
> break;
> #ifdef INET6
> case AF_INET6:
> + KERNEL_LOCK();
> + /* XXXSMP there is a MP race in nd6_resolve() */
> error = nd6_resolve(ifp, rt, m, dst, eh->ether_dhost);
> + KERNEL_UNLOCK();
> if (error)
> return (error);
> break;
> #endif
> case AF_INET:
> + KERNEL_LOCK();
> + /* XXXSMP there is a MP race in arpresolve() */
> error = arpresolve(ifp, rt, m, dst, eh->ether_dhost);
> + KERNEL_UNLOCK();
> if (error)
> return (error);
> break;
> @@ -529,12 +541,14 @@ ether_input(struct ifnet *ifp, struct mb
> case ETHERTYPE_PPPOE:
> if (m->m_flags & (M_MCAST | M_BCAST))
> goto dropanyway;
> + KERNEL_LOCK();
> #ifdef PIPEX
> if (pipex_enable) {
> struct pipex_session *session;
>
> if ((session = pipex_pppoe_lookup_session(m)) != NULL) {
> pipex_pppoe_input(m, session);
> + KERNEL_UNLOCK();
> return;
> }
> }
> @@ -543,6 +557,7 @@ ether_input(struct ifnet *ifp, struct mb
> pppoe_disc_input(m);
> else
> pppoe_data_input(m);
> + KERNEL_UNLOCK();
> return;
> #endif
> #ifdef MPLS
> Index: net/ifq.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/ifq.c,v
> retrieving revision 1.44
> diff -u -p -r1.44 ifq.c
> --- net/ifq.c 9 Jul 2021 01:22:05 -0000 1.44
> +++ net/ifq.c 3 Dec 2021 18:51:29 -0000
> @@ -243,7 +243,7 @@ void
> ifq_init(struct ifqueue *ifq, struct ifnet *ifp, unsigned int idx)
> {
> ifq->ifq_if = ifp;
> - ifq->ifq_softnet = net_tq(ifp->if_index); /* + idx */
> + ifq->ifq_softnet = net_tq(ifp->if_index + idx);
> ifq->ifq_softc = NULL;
>
> mtx_init(&ifq->ifq_mtx, IPL_NET);
> @@ -620,7 +620,7 @@ void
> ifiq_init(struct ifiqueue *ifiq, struct ifnet *ifp, unsigned int idx)
> {
> ifiq->ifiq_if = ifp;
> - ifiq->ifiq_softnet = net_tq(ifp->if_index); /* + idx */
> + ifiq->ifiq_softnet = net_tq(ifp->if_index + idx);
> ifiq->ifiq_softc = NULL;
>
> mtx_init(&ifiq->ifiq_mtx, IPL_NET);
> Index: net/netisr.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/netisr.h,v
> retrieving revision 1.55
> diff -u -p -r1.55 netisr.h
> --- net/netisr.h 5 Jan 2021 20:43:36 -0000 1.55
> +++ net/netisr.h 3 Dec 2021 18:51:29 -0000
> @@ -41,8 +41,10 @@
> * interrupt used for scheduling the network code to calls
> * on the lowest level routine of each protocol.
> */
> +#define NETISR_IP 2 /* same as AF_INET */
> #define NETISR_PFSYNC 5 /* for pfsync "immediate" tx */
> #define NETISR_ARP 18 /* same as AF_LINK */
> +#define NETISR_IPV6 24 /* same as AF_INET6 */
> #define NETISR_PPP 28 /* for PPP processing */
> #define NETISR_BRIDGE 29 /* for bridge processing */
> #define NETISR_SWITCH 31 /* for switch dataplane */
> @@ -57,6 +59,8 @@ extern int netisr; /* scheduling bits
> extern struct task if_input_task_locked;
>
> void arpintr(void);
> +void ipintr(void);
> +void ip6intr(void);
> void pppintr(void);
> void bridgeintr(void);
> void switchintr(void);
> Index: netinet/ip_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v
> retrieving revision 1.364
> diff -u -p -r1.364 ip_input.c
> --- netinet/ip_input.c 22 Nov 2021 13:47:10 -0000 1.364
> +++ netinet/ip_input.c 3 Dec 2021 18:52:11 -0000
> @@ -130,6 +130,8 @@ const struct sysctl_bounded_args ipctl_v
> { IPCTL_ARPDOWN, &arpt_down, 0, INT_MAX },
> };
>
> +struct niqueue ipintrq = NIQUEUE_INITIALIZER(IPQ_MAXLEN, NETISR_IP);
> +
> struct pool ipqent_pool;
> struct pool ipq_pool;
>
> @@ -143,6 +145,7 @@ static struct mbuf_queue ipsendraw_mq;
> extern struct niqueue arpinq;
>
> int ip_ours(struct mbuf **, int *, int, int);
> +int ip_local(struct mbuf **, int *, int, int);
> int ip_dooptions(struct mbuf *, struct ifnet *);
> int in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **);
>
> @@ -230,6 +233,43 @@ ip_init(void)
> }
>
> /*
> + * Enqueue packet for local delivery. Queuing is used as a boundary
> + * between the network layer (input/forward path) running with shared
> + * NET_RLOCK_IN_SOFTNET() and the transport layer needing it exclusively.
> + */
> +int
> +ip_ours(struct mbuf **mp, int *offp, int nxt, int af)
> +{
> + /* We are already in a IPv4/IPv6 local deliver loop. */
> + if (af != AF_UNSPEC)
> + return ip_local(mp, offp, nxt, af);
> +
> + niq_enqueue(&ipintrq, *mp);
> + *mp = NULL;
> + return IPPROTO_DONE;
> +}
> +
> +/*
> + * Dequeue and process locally delivered packets.
> + */
> +void
> +ipintr(void)
> +{
> + struct mbuf *m;
> + int off, nxt;
> +
> + while ((m = niq_dequeue(&ipintrq)) != NULL) {
> +#ifdef DIAGNOSTIC
> + if ((m->m_flags & M_PKTHDR) == 0)
> + panic("ipintr no HDR");
> +#endif
> + off = 0;
> + nxt = ip_local(&m, &off, IPPROTO_IPV4, AF_UNSPEC);
> + KASSERT(nxt == IPPROTO_DONE);
> + }
> +}
> +
> +/*
> * IPv4 input routine.
> *
> * Checksum and byte swap header. Process options. Forward or deliver.
> @@ -514,7 +554,7 @@ ip_input_if(struct mbuf **mp, int *offp,
> * If fragmented try to reassemble. Pass to next level.
> */
> int
> -ip_ours(struct mbuf **mp, int *offp, int nxt, int af)
> +ip_local(struct mbuf **mp, int *offp, int nxt, int af)
> {
> struct mbuf *m = *mp;
> struct ip *ip = mtod(m, struct ip *);
> @@ -1663,7 +1703,8 @@ ip_sysctl(int *name, u_int namelen, void
> newlen));
> #endif
> case IPCTL_IFQUEUE:
> - return (EOPNOTSUPP);
> + return (sysctl_niq(name + 1, namelen - 1,
> + oldp, oldlenp, newp, newlen, &ipintrq));
> case IPCTL_ARPQUEUE:
> return (sysctl_niq(name + 1, namelen - 1,
> oldp, oldlenp, newp, newlen, &arpinq));
> Index: netinet/ip_var.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_var.h,v
> retrieving revision 1.88
> diff -u -p -r1.88 ip_var.h
> --- netinet/ip_var.h 30 Mar 2021 08:37:11 -0000 1.88
> +++ netinet/ip_var.h 3 Dec 2021 18:51:29 -0000
> @@ -248,7 +248,6 @@ void ip_stripoptions(struct mbuf *);
> int ip_sysctl(int *, u_int, void *, size_t *, void *, size_t);
> void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
> struct mbuf *);
> -void ipintr(void);
> int ip_input_if(struct mbuf **, int *, int, int, struct ifnet *);
> int ip_deliver(struct mbuf **, int *, int, int);
> void ip_forward(struct mbuf *, struct ifnet *, struct rtentry *, int);
> Index: netinet6/ip6_input.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_input.c,v
> retrieving revision 1.237
> diff -u -p -r1.237 ip6_input.c
> --- netinet6/ip6_input.c 3 Jun 2021 04:47:54 -0000 1.237
> +++ netinet6/ip6_input.c 3 Dec 2021 18:52:28 -0000
> @@ -115,11 +115,14 @@
> #include <netinet/ip_carp.h>
> #endif
>
> +struct niqueue ip6intrq = NIQUEUE_INITIALIZER(IPQ_MAXLEN, NETISR_IPV6);
> +
> struct cpumem *ip6counters;
>
> uint8_t ip6_soiikey[IP6_SOIIKEY_LEN];
>
> int ip6_ours(struct mbuf **, int *, int, int);
> +int ip6_local(struct mbuf **, int *, int, int);
> int ip6_check_rh0hdr(struct mbuf *, int *);
> int ip6_hbhchcheck(struct mbuf *, int *, int *, int *);
> int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);
> @@ -162,6 +165,43 @@ ip6_init(void)
> ip6counters = counters_alloc(ip6s_ncounters);
> }
>
> +/*
> + * Enqueue packet for local delivery. Queuing is used as a boundary
> + * between the network layer (input/forward path) running with shared
> + * NET_RLOCK_IN_SOFTNET() and the transport layer needing it exclusively.
> + */
> +int
> +ip6_ours(struct mbuf **mp, int *offp, int nxt, int af)
> +{
> + /* We are already in a IPv4/IPv6 local deliver loop. */
> + if (af != AF_UNSPEC)
> + return ip6_local(mp, offp, nxt, af);
> +
> + niq_enqueue(&ip6intrq, *mp);
> + *mp = NULL;
> + return IPPROTO_DONE;
> +}
> +
> +/*
> + * Dequeue and process locally delivered packets.
> + */
> +void
> +ip6intr(void)
> +{
> + struct mbuf *m;
> + int off, nxt;
> +
> + while ((m = niq_dequeue(&ip6intrq)) != NULL) {
> +#ifdef DIAGNOSTIC
> + if ((m->m_flags & M_PKTHDR) == 0)
> + panic("ip6intr no HDR");
> +#endif
> + off = 0;
> + nxt = ip6_local(&m, &off, IPPROTO_IPV6, AF_UNSPEC);
> + KASSERT(nxt == IPPROTO_DONE);
> + }
> +}
> +
> void
> ipv6_input(struct ifnet *ifp, struct mbuf *m)
> {
> @@ -544,7 +584,7 @@ ip6_input_if(struct mbuf **mp, int *offp
> }
>
> int
> -ip6_ours(struct mbuf **mp, int *offp, int nxt, int af)
> +ip6_local(struct mbuf **mp, int *offp, int nxt, int af)
> {
> if (ip6_hbhchcheck(*mp, offp, &nxt, NULL))
> return IPPROTO_DONE;
> @@ -1470,7 +1510,8 @@ ip6_sysctl(int *name, u_int namelen, voi
> NET_UNLOCK();
> return (error);
> case IPV6CTL_IFQUEUE:
> - return (EOPNOTSUPP);
> + return (sysctl_niq(name + 1, namelen - 1,
> + oldp, oldlenp, newp, newlen, &ip6intrq));
> case IPV6CTL_SOIIKEY:
> return (ip6_sysctl_soiikey(oldp, oldlenp, newp, newlen));
> default:
>