Add matching of the running CPU against a per-socket CPU mask. This is useful for TCP listeners and unconnected UDP sockets. It works with SO_REUSEPORT to steer packets to listener sockets based on CPU affinity.
In this patch: - Add SO_INCOMING_CPU_MASK - Add a CPU mask pointer to struct sock - Get/setsockopt to get/set the mask on a socket - Compat functions for the sockopts - Add sk_match_incoming_cpu_mask to check if the running CPU is in a mask for a socket - Call sk_match_incoming_cpu_mask from inet compute_score and UDP functions for IPv4 and IPv6 Signed-off-by: Tom Herbert <t...@herbertland.com> --- arch/alpha/include/uapi/asm/socket.h | 2 + arch/avr32/include/uapi/asm/socket.h | 2 + arch/cris/include/uapi/asm/socket.h | 2 + arch/frv/include/uapi/asm/socket.h | 2 + arch/ia64/include/uapi/asm/socket.h | 2 + arch/m32r/include/uapi/asm/socket.h | 2 + arch/mips/include/uapi/asm/socket.h | 2 + arch/mn10300/include/uapi/asm/socket.h | 2 + arch/parisc/include/uapi/asm/socket.h | 2 + arch/powerpc/include/uapi/asm/socket.h | 2 + arch/s390/include/uapi/asm/socket.h | 2 + arch/sparc/include/uapi/asm/socket.h | 2 + arch/xtensa/include/uapi/asm/socket.h | 2 + include/net/sock.h | 31 +++++++++++++ include/uapi/asm-generic/socket.h | 2 + net/compat.c | 56 ++++++++++++++++++++++++ net/core/sock.c | 80 ++++++++++++++++++++++++++++++++++ net/ipv4/inet_hashtables.c | 3 ++ net/ipv4/udp.c | 6 +++ net/ipv6/inet6_hashtables.c | 3 ++ net/ipv6/udp.c | 3 ++ 21 files changed, 210 insertions(+) diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 9a20821..eae65a2 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -92,4 +92,6 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 51 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h index 2b65ed6..89515e3 100644 --- a/arch/avr32/include/uapi/asm/socket.h +++ b/arch/avr32/include/uapi/asm/socket.h @@ -85,4 +85,6 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 51 + #endif /* _UAPI__ASM_AVR32_SOCKET_H */ 
diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h index e2503d9f..65fcf0e 100644 --- a/arch/cris/include/uapi/asm/socket.h +++ b/arch/cris/include/uapi/asm/socket.h @@ -87,6 +87,8 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 51 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index 4823ad1..1af3b78 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -85,5 +85,7 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 51 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 59be3d8..7ef59d3 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -94,4 +94,6 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 51 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index 7bc4cb2..53a697c 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -85,4 +85,6 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 51 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index dec3c85..063d59d 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -103,4 +103,6 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 51 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index cab7d6d..3c9f8e9 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -85,4 +85,6 @@ #define SO_ATTACH_BPF 50 
#define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 51 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index a5cd40c..557a09b 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -84,4 +84,6 @@ #define SO_ATTACH_BPF 0x402B #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 0x402C + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h index c046666..a72fac6 100644 --- a/arch/powerpc/include/uapi/asm/socket.h +++ b/arch/powerpc/include/uapi/asm/socket.h @@ -92,4 +92,6 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 51 + #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 296942d..b901044 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -91,4 +91,6 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 51 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index e6a16c4..95835a1 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -81,6 +81,8 @@ #define SO_ATTACH_BPF 0x0034 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 0x0035 + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 4120af0..0167812 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -96,4 +96,6 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 51 + #endif 
/* _XTENSA_SOCKET_H */ diff --git a/include/net/sock.h b/include/net/sock.h index bcf6114..8407c3b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -123,6 +123,11 @@ typedef struct { #endif } socket_lock_t; +struct rcu_cpumask { + struct rcu_head rcu; + unsigned long cpumask[0]; +}; + struct sock; struct proto; struct net; @@ -150,6 +155,7 @@ typedef __u64 __bitwise __addrpair; * @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol * @skc_tx_queue_mapping: tx queue number for this connection + * @skc_incoming_cpu_mask: CPU mask for listeners * @skc_refcnt: reference count * * This is the minimal network layer representation of sockets, the header @@ -212,9 +218,12 @@ struct sock_common { struct hlist_nulls_node skc_nulls_node; }; + struct rcu_cpumask __rcu *skc_incoming_cpu_mask; + /* Cachelines above this point are read mostly and are used in socket * lookup. */ + int skc_tx_queue_mapping ____cacheline_aligned_in_smp; @@ -314,6 +323,7 @@ struct sock { #define sk_node __sk_common.skc_node #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt +#define sk_incoming_cpu_mask __sk_common.skc_incoming_cpu_mask #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin @@ -2220,6 +2230,27 @@ static inline bool sk_fullsock(const struct sock *sk) return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); } +static inline bool sk_match_incoming_cpu_mask(const struct sock *sk) +{ + struct rcu_cpumask *mask; + bool ret = false; + + if (!sk->sk_incoming_cpu_mask) + return ret; + + rcu_read_lock(); + + mask = rcu_dereference(sk->sk_incoming_cpu_mask); + if (likely(mask) && + cpumask_test_cpu(raw_smp_processor_id(), + to_cpumask(mask->cpumask))) + ret = true; + + rcu_read_unlock(); + + return ret; +} + void sock_enable_timestamp(struct sock *sk, int flag); int sock_get_timestamp(struct sock 
*, struct timeval __user *); int sock_get_timestampns(struct sock *, struct timespec __user *); diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 5c15c2a..d41c8b9 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -87,4 +87,6 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_INCOMING_CPU_MASK 51 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/net/compat.c b/net/compat.c index 5cfd26a..f9fc5ce 100644 --- a/net/compat.c +++ b/net/compat.c @@ -351,6 +351,23 @@ static int do_set_sock_timeout(struct socket *sock, int level, return err; } +static int do_set_incoming_cpu_mask(struct socket *sock, int level, + int optname, char __user *optval, unsigned int optlen) +{ + compat_ulong_t __user *user_mask_ptr = + (compat_ulong_t __user *)optval; + struct cpumask __user *mask = compat_alloc_user_space(cpumask_size()); + int err; + + err = compat_get_user_cpu_mask(user_mask_ptr, optlen, mask); + if (err) + return err; + + return sock_setsockopt(sock, level, optname, + (char __user *)cpumask_bits(mask), + cpumask_size()); +} + static int compat_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -360,6 +377,10 @@ static int compat_sock_setsockopt(struct socket *sock, int level, int optname, if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO) return do_set_sock_timeout(sock, level, optname, optval, optlen); + if (optname == SO_INCOMING_CPU_MASK) + return do_set_incoming_cpu_mask(sock, level, optname, + optval, optlen); + return sock_setsockopt(sock, level, optname, optval, optlen); } @@ -419,11 +440,46 @@ static int do_get_sock_timeout(struct socket *sock, int level, int optname, return err; } +static int do_get_incoming_cpu_mask(struct socket *sock, int level, + int optname, char __user *optval, unsigned int __user *optlen) +{ + compat_ulong_t __user *user_mask_ptr = + (compat_ulong_t __user *)optval; + struct 
cpumask __user *mask = compat_alloc_user_space(cpumask_size()); + int len, err; + + if (get_user(len, optlen)) + return -EFAULT; + + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(compat_ulong_t) - 1)) + return -EINVAL; + + if (put_user(cpumask_size(), optlen)) + return -EFAULT; + + err = sock_getsockopt(sock, level, optname, + (char __user *)cpumask_bits(mask), optlen); + if (err == 0) + if (get_user(len, optlen) || + compat_put_bitmap(user_mask_ptr, + cpumask_bits(mask), len * 8)) + err = -EFAULT; + + return err; +} + static int compat_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO) return do_get_sock_timeout(sock, level, optname, optval, optlen); + + if (optname == SO_INCOMING_CPU_MASK) + return do_get_incoming_cpu_mask(sock, level, optname, + optval, optlen); + return sock_getsockopt(sock, level, optname, optval, optlen); } diff --git a/net/core/sock.c b/net/core/sock.c index 29124fc..25fc8a7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -672,6 +672,71 @@ bool sk_mc_loop(struct sock *sk) } EXPORT_SYMBOL(sk_mc_loop); +static int do_set_incoming_cpu_mask(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct rcu_cpumask *new_mask, *old_mask; + unsigned long *k; + + old_mask = rcu_dereference_protected(sk->sk_incoming_cpu_mask, + sock_owned_by_user(sk)); + + if (optlen == 0) { + RCU_INIT_POINTER(sk->sk_incoming_cpu_mask, NULL); + if (old_mask) { + kfree_rcu(old_mask, rcu); + return 0; + } + } + + if (optlen & (sizeof(unsigned long) - 1)) + return -EINVAL; + + new_mask = kzalloc(sizeof(*new_mask) + cpumask_size(), GFP_KERNEL); + if (!new_mask) + return -ENOMEM; + + k = cpumask_bits(to_cpumask(new_mask->cpumask)); + if (copy_from_user(k, optval, min_t(int, optlen, cpumask_size()))) + return -EFAULT; + + rcu_assign_pointer(sk->sk_incoming_cpu_mask, new_mask); + + if (old_mask) + kfree_rcu(old_mask, 
rcu); + + return 0; +} + +static int do_get_incoming_cpu_mask(struct sock *sk, char __user *optval, + unsigned int __user *optlen, + unsigned int len) +{ + struct rcu_cpumask *mask; + unsigned long *k; + int err = 0; + + if (len < cpumask_size()) + return -EINVAL; + + if (len & (sizeof(unsigned long) - 1)) + return -EINVAL; + + rcu_read_lock(); + + mask = rcu_dereference(sk->sk_incoming_cpu_mask); + + k = cpumask_bits(to_cpumask(mask->cpumask)); + if (copy_to_user(optval, k, cpumask_size())) + err = -EFAULT; + else + put_user(cpumask_size(), optlen); + + rcu_read_unlock(); + + return err; +} + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -990,6 +1055,10 @@ set_rcvbuf: sk->sk_max_pacing_rate); break; + case SO_INCOMING_CPU_MASK: + ret = do_set_incoming_cpu_mask(sk, optval, optlen); + break; + default: ret = -ENOPROTOOPT; break; @@ -1250,6 +1319,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_incoming_cpu; break; + case SO_INCOMING_CPU_MASK: + return do_get_incoming_cpu_mask(sk, optval, optlen, len); + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). 
@@ -1429,6 +1501,7 @@ EXPORT_SYMBOL(sk_alloc); static void __sk_free(struct sock *sk) { struct sk_filter *filter; + struct rcu_cpumask *incoming_cpu_mask; if (sk->sk_destruct) sk->sk_destruct(sk); @@ -1440,6 +1513,12 @@ static void __sk_free(struct sock *sk) RCU_INIT_POINTER(sk->sk_filter, NULL); } + incoming_cpu_mask = rcu_dereference(sk->sk_incoming_cpu_mask); + if (incoming_cpu_mask) { + kfree_rcu(incoming_cpu_mask, rcu); + RCU_INIT_POINTER(sk->sk_incoming_cpu_mask, NULL); + } + sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); if (atomic_read(&sk->sk_omem_alloc)) @@ -1543,6 +1622,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_err = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); + RCU_INIT_POINTER(newsk->sk_incoming_cpu_mask, NULL); atomic64_set(&newsk->sk_cookie, 0); /* * Before updating sk_refcnt, we must commit prior changes to memory diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 3766bdd..2e9a95f 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -184,6 +184,9 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } + + if (sk_match_incoming_cpu_mask(sk)) + score += 4; } return score; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d10b7e0..dc6a3da 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -375,6 +375,9 @@ static inline int compute_score(struct sock *sk, struct net *net, score += 4; } + if (sk_match_incoming_cpu_mask(sk)) + score += 4; + return score; } @@ -418,6 +421,9 @@ static inline int compute_score2(struct sock *sk, struct net *net, score += 4; } + if (sk_match_incoming_cpu_mask(sk)) + score += 4; + return score; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 871641b..8cc4ba9 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -114,6 +114,9 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; 
score++; } + + if (sk_match_incoming_cpu_mask(sk)) + score += 4; } return score; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c2ec416..a0c9a80 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -182,6 +182,9 @@ static inline int compute_score(struct sock *sk, struct net *net, score++; } + if (sk_match_incoming_cpu_mask(sk)) + score++; + return score; } -- 1.8.1 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html