On 6/16/17, 6:27 AM, "Daniel Borkmann" <dan...@iogearbox.net> wrote:
On 06/15/2017 10:08 PM, Lawrence Brakmo wrote: > Added support for calling a subset of socket setsockopts from > BPF_PROG_TYPE_SOCKET_OPS programs. The code was duplicated rather > than making the changes to call the socket setsockopt function because > the changes required would have been larger. > > The ops supported are: > SO_RCVBUF > SO_SNDBUF > SO_MAX_PACING_RATE > SO_PRIORITY > SO_RCVLOWAT > SO_MARK > > Signed-off-by: Lawrence Brakmo <bra...@fb.com> > --- > include/uapi/linux/bpf.h | 14 ++++++++- > net/core/filter.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++- > samples/bpf/bpf_helpers.h | 3 ++ > 3 files changed, 92 insertions(+), 2 deletions(-) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index d945336..8accb4d 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -520,6 +520,17 @@ union bpf_attr { > * Set full skb->hash. > * @skb: pointer to skb > * @hash: hash to set > + * > + * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen) > + * Calls setsockopt. Not all opts are available, only those with > + * integer optvals plus TCP_CONGESTION. > + * Supported levels: SOL_SOCKET and IPROTO_TCP > + * @bpf_socket: pointer to bpf_socket > + * @level: SOL_SOCKET or IPROTO_TCP > + * @optname: option name > + * @optval: pointer to option value > + * @optlen: length of optval in byes > + * Return: 0 or negative error > */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -570,7 +581,8 @@ union bpf_attr { > FN(probe_read_str), \ > FN(get_socket_cookie), \ > FN(get_socket_uid), \ > - FN(set_hash), > + FN(set_hash), \ > + FN(setsockopt), > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > * function eBPF program intends to call > diff --git a/net/core/filter.c b/net/core/filter.c > index 7466f55..9ff567c 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -54,6 +54,7 @@ > #include <net/dst.h> > #include <net/sock_reuseport.h> > #include <net/busy_poll.h> > +#include <net/tcp.h> > > /** > * sk_filter_trim_cap - run a packet through a socket filter > @@ -2671,6 +2672,69 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { > .arg1_type = ARG_PTR_TO_CTX, > }; > > +BPF_CALL_5(bpf_setsockopt, struct bpf_socket_ops_kern *, bpf_socket, > + int, level, int, optname, char *, optval, int, optlen) > +{ > + struct sock *sk = bpf_socket->sk; > + int ret = 0; > + int val; > + > + if (bpf_socket->is_req_sock) > + return -EINVAL; > + > + if (level == SOL_SOCKET) { > + /* Only some socketops are supported */ > + val = *((int *)optval); > + > + switch (optname) { > + case SO_RCVBUF: > + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; > + sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); > + break; > + case SO_SNDBUF: > + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; > + sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); > + break; > + case SO_MAX_PACING_RATE: > + sk->sk_max_pacing_rate = val; > + sk->sk_pacing_rate = min(sk->sk_pacing_rate, > + sk->sk_max_pacing_rate); > + break; > + case SO_PRIORITY: > + sk->sk_priority = val; > + break; > + case SO_RCVLOWAT: > + if (val < 0) > + val = INT_MAX; > + sk->sk_rcvlowat = val ? : 1; > + break; > + case SO_MARK: > + sk->sk_mark = val; > + break; Likely all of the above, I think we could make programs even more flexible if these are members in the context struct itself, so that socket_ops_convert_ctx_access() would translate them inline w/o even needing a helper call with the bonus of having read and write access. I initially refactored sock_setsockopt and tcp_setsockopt so they could be called from bpf programs, but it turned out not to be worth it since a lot of them cannot be called from there. These operations should not be called too often (per sock lifetime) so I was not too concerned about efficiency. In addition, some of these are not viable if the sock is a req_sock, and I didn’t think the performance gain would be worth the added complexity. I already have other enhancements in mind that will require access to fields in the tcp_sock structure. I was planning to implement those as you proposed because they would be accessed more often. But what about socket lock that we otherwise have in sock_setsockopt()? My understanding is that the lock is needed because the sock_setsockopt() function is called from user space. The sock_ops BPF programs are being called from within the network stack and I thought a lock was not needed in this case. For example, the call to bh_lock_sock_nested() in tcp_v4_rcv() Could we add a sock_owned_by_me(sk) to tcp_call_bpf() to really assert this for all cases, so that lockdep complains should it ever be not the case? Done (if it is not a req_sock). > + default: > + ret = -EINVAL; > + } > + } else if (level == SOL_TCP && > + bpf_socket->sk->sk_prot->setsockopt == tcp_setsockopt) { > + /* Place holder */ > + ret = -EINVAL; > + } else { > + ret = -EINVAL; > + } > + return ret; > +} > + > +static const struct bpf_func_proto bpf_setsockopt_proto = { > + .func = bpf_setsockopt, > + .gpl_only = true, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_PTR_TO_CTX, > + .arg2_type = ARG_ANYTHING, > + .arg3_type = ARG_ANYTHING, > + .arg4_type = ARG_PTR_TO_MEM, > + .arg5_type = ARG_CONST_SIZE_OR_ZERO, > +}; > + > static const struct bpf_func_proto * > bpf_base_func_proto(enum bpf_func_id func_id) > { > @@ -2822,6 +2886,17 @@ lwt_inout_func_proto(enum bpf_func_id func_id) > } > > static const struct bpf_func_proto * > + socket_ops_func_proto(enum bpf_func_id func_id) > +{ > + switch (func_id) { > + case BPF_FUNC_setsockopt: > + return &bpf_setsockopt_proto; > + default: > + return bpf_base_func_proto(func_id); > + } > +} > + > +static const struct bpf_func_proto * > lwt_xmit_func_proto(enum bpf_func_id func_id) > { > switch (func_id) { > @@ -3565,7 +3640,7 @@ const struct bpf_verifier_ops cg_sock_prog_ops = { > }; > > const struct bpf_verifier_ops socket_ops_prog_ops = { > - .get_func_proto = bpf_base_func_proto, > + .get_func_proto = socket_ops_func_proto, > .is_valid_access = socket_ops_is_valid_access, > .convert_ctx_access = socket_ops_convert_ctx_access, > }; > diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h > index f4840b8..d50ac34 100644 > --- a/samples/bpf/bpf_helpers.h > +++ b/samples/bpf/bpf_helpers.h > @@ -60,6 +60,9 @@ static unsigned long long (*bpf_get_prandom_u32)(void) = > (void *) BPF_FUNC_get_prandom_u32; > static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = > (void *) BPF_FUNC_xdp_adjust_head; > +static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, > + int optlen) = > + (void *) BPF_FUNC_setsockopt; > > /* llvm builtin functions that eBPF C program may use to > * emit BPF_LD_ABS and BPF_LD_IND instructions >