On 6/16/17, 6:27 AM, "Daniel Borkmann" <dan...@iogearbox.net> wrote:

    On 06/15/2017 10:08 PM, Lawrence Brakmo wrote:
    > Added support for calling a subset of socket setsockopts from
    > BPF_PROG_TYPE_SOCKET_OPS programs. The code was duplicated rather
    > than making the changes to call the socket setsockopt function because
    > the changes required would have been larger.
    >
    > The ops supported are:
    >    SO_RCVBUF
    >    SO_SNDBUF
    >    SO_MAX_PACING_RATE
    >    SO_PRIORITY
    >    SO_RCVLOWAT
    >    SO_MARK
    >
    > Signed-off-by: Lawrence Brakmo <bra...@fb.com>
    > ---
    >   include/uapi/linux/bpf.h  | 14 ++++++++-
    >   net/core/filter.c         | 77 
++++++++++++++++++++++++++++++++++++++++++++++-
    >   samples/bpf/bpf_helpers.h |  3 ++
    >   3 files changed, 92 insertions(+), 2 deletions(-)
    >
    > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
    > index d945336..8accb4d 100644
    > --- a/include/uapi/linux/bpf.h
    > +++ b/include/uapi/linux/bpf.h
    > @@ -520,6 +520,17 @@ union bpf_attr {
    >    *     Set full skb->hash.
    >    *     @skb: pointer to skb
    >    *     @hash: hash to set
    > + *
    > + * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
    > + *     Calls setsockopt. Not all opts are available, only those with
    > + *     integer optvals plus TCP_CONGESTION.
    > + *     Supported levels: SOL_SOCKET and IPROTO_TCP
    > + *     @bpf_socket: pointer to bpf_socket
    > + *     @level: SOL_SOCKET or IPROTO_TCP
    > + *     @optname: option name
    > + *     @optval: pointer to option value
    > + *     @optlen: length of optval in byes
    > + *     Return: 0 or negative error
    >    */
    >   #define __BPF_FUNC_MAPPER(FN)           \
    >           FN(unspec),                     \
    > @@ -570,7 +581,8 @@ union bpf_attr {
    >           FN(probe_read_str),             \
    >           FN(get_socket_cookie),          \
    >           FN(get_socket_uid),             \
    > - FN(set_hash),
    > + FN(set_hash),                   \
    > + FN(setsockopt),
    >
    >   /* integer value in 'imm' field of BPF_CALL instruction selects which 
helper
    >    * function eBPF program intends to call
    > diff --git a/net/core/filter.c b/net/core/filter.c
    > index 7466f55..9ff567c 100644
    > --- a/net/core/filter.c
    > +++ b/net/core/filter.c
    > @@ -54,6 +54,7 @@
    >   #include <net/dst.h>
    >   #include <net/sock_reuseport.h>
    >   #include <net/busy_poll.h>
    > +#include <net/tcp.h>
    >
    >   /**
    >    *      sk_filter_trim_cap - run a packet through a socket filter
    > @@ -2671,6 +2672,69 @@ static const struct bpf_func_proto 
bpf_get_socket_uid_proto = {
    >           .arg1_type      = ARG_PTR_TO_CTX,
    >   };
    >
    > +BPF_CALL_5(bpf_setsockopt, struct bpf_socket_ops_kern *, bpf_socket,
    > +    int, level, int, optname, char *, optval, int, optlen)
    > +{
    > + struct sock *sk = bpf_socket->sk;
    > + int ret = 0;
    > + int val;
    > +
    > + if (bpf_socket->is_req_sock)
    > +         return -EINVAL;
    > +
    > + if (level == SOL_SOCKET) {
    > +         /* Only some socketops are supported */
    > +         val = *((int *)optval);
    > +
    > +         switch (optname) {
    > +         case SO_RCVBUF:
    > +                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
    > +                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
    > +                 break;
    > +         case SO_SNDBUF:
    > +                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
    > +                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
    > +                 break;
    > +         case SO_MAX_PACING_RATE:
    > +                 sk->sk_max_pacing_rate = val;
    > +                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
    > +                                          sk->sk_max_pacing_rate);
    > +                 break;
    > +         case SO_PRIORITY:
    > +                 sk->sk_priority = val;
    > +                 break;
    > +         case SO_RCVLOWAT:
    > +                 if (val < 0)
    > +                         val = INT_MAX;
    > +                 sk->sk_rcvlowat = val ? : 1;
    > +                 break;
    > +         case SO_MARK:
    > +                 sk->sk_mark = val;
    > +                 break;
    
    Likely all of the above, I think we could make programs even more
    flexible if these are members in the context struct itself, so
    that socket_ops_convert_ctx_access() would translate them inline
    w/o even needing a helper call with the bonus of having read and
    write access.

I initially refactored sock_setsockopt and tcp_setsockopt so they
could be called from bpf programs, but it turned out not to be
worth it since a lot of them cannot be called from there.
These operations should not be called too often (per sock lifetime)
so I was not too concerned about efficiency. In addition, some of
these are not viable if the sock is a req_sock, and I didn’t think the
performance gain would be worth the added complexity. 

I already have other enhancements in mind that will require
access to fields in the tcp_sock structure. I was planning to
implement those as you proposed because they would be
accessed more often.

    
    But what about socket lock that we otherwise have in sock_setsockopt()?
    
My understanding is that the lock is needed because the sock_setsockopt()
function is called from user space. The sock_ops BPF programs are being
called from within the network stack and I thought a lock was not needed
in this case. For example, the call to bh_lock_sock_nested() in tcp_v4_rcv()

    Could we add a sock_owned_by_me(sk) to tcp_call_bpf() to really assert
    this for all cases, so that lockdep complains should it ever be not
    the case?

Done (if it is not a req_sock).

    > +         default:
    > +                 ret = -EINVAL;
    > +         }
    > + } else if (level == SOL_TCP &&
    > +            bpf_socket->sk->sk_prot->setsockopt == tcp_setsockopt) {
    > +         /* Place holder */
    > +         ret = -EINVAL;
    > + } else {
    > +         ret = -EINVAL;
    > + }
    > + return ret;
    > +}
    > +
    > +static const struct bpf_func_proto bpf_setsockopt_proto = {
    > + .func           = bpf_setsockopt,
    > + .gpl_only       = true,
    > + .ret_type       = RET_INTEGER,
    > + .arg1_type      = ARG_PTR_TO_CTX,
    > + .arg2_type      = ARG_ANYTHING,
    > + .arg3_type      = ARG_ANYTHING,
    > + .arg4_type      = ARG_PTR_TO_MEM,
    > + .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
    > +};
    > +
    >   static const struct bpf_func_proto *
    >   bpf_base_func_proto(enum bpf_func_id func_id)
    >   {
    > @@ -2822,6 +2886,17 @@ lwt_inout_func_proto(enum bpf_func_id func_id)
    >   }
    >
    >   static const struct bpf_func_proto *
    > + socket_ops_func_proto(enum bpf_func_id func_id)
    > +{
    > + switch (func_id) {
    > + case BPF_FUNC_setsockopt:
    > +         return &bpf_setsockopt_proto;
    > + default:
    > +         return bpf_base_func_proto(func_id);
    > + }
    > +}
    > +
    > +static const struct bpf_func_proto *
    >   lwt_xmit_func_proto(enum bpf_func_id func_id)
    >   {
    >           switch (func_id) {
    > @@ -3565,7 +3640,7 @@ const struct bpf_verifier_ops cg_sock_prog_ops = {
    >   };
    >
    >   const struct bpf_verifier_ops socket_ops_prog_ops = {
    > - .get_func_proto         = bpf_base_func_proto,
    > + .get_func_proto         = socket_ops_func_proto,
    >           .is_valid_access        = socket_ops_is_valid_access,
    >           .convert_ctx_access     = socket_ops_convert_ctx_access,
    >   };
    > diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
    > index f4840b8..d50ac34 100644
    > --- a/samples/bpf/bpf_helpers.h
    > +++ b/samples/bpf/bpf_helpers.h
    > @@ -60,6 +60,9 @@ static unsigned long long (*bpf_get_prandom_u32)(void) =
    >           (void *) BPF_FUNC_get_prandom_u32;
    >   static int (*bpf_xdp_adjust_head)(void *ctx, int offset) =
    >           (void *) BPF_FUNC_xdp_adjust_head;
    > +static int (*bpf_setsockopt)(void *ctx, int level, int optname, void 
*optval,
    > +                      int optlen) =
    > + (void *) BPF_FUNC_setsockopt;
    >
    >   /* llvm builtin functions that eBPF C program may use to
    >    * emit BPF_LD_ABS and BPF_LD_IND instructions
    >
    
    

Reply via email to