Re: [PATCH bpf-next 1/5] bpf: Hooks for sys_sendmsg

Martin KaFai Lau Mon, 21 May 2018 16:18:12 -0700

On Fri, May 18, 2018 at 07:21:09PM -0700, Andrey Ignatov wrote:
> In addition to already existing BPF hooks for sys_bind and sys_connect,
> the patch provides new hooks for sys_sendmsg.
> 
> It leverages existing BPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR`
> that provides access to socket itlself (properties like family, type,
> protocol) and user-passed `struct sockaddr *` so that BPF program can
> override destination IP and port for system calls such as sendto(2) or
> sendmsg(2) and/or assign source IP to the socket.
> 
> The hooks are implemented as two new attach types:
> `BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG` for UDPv4 and
> UDPv6 correspondingly.
> 
> UDPv4 and UDPv6 separate attach types for same reason as sys_bind and
> sys_connect hooks, i.e. to prevent reading from / writing to e.g.
> user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound.
> 
> The difference with already existing hooks is sys_sendmsg are
> implemented only for unconnected UDP.
> 
> For TCP it doesn't make sense to change user-provided `struct sockaddr *`
> at sendto(2)/sendmsg(2) time since socket either was already connected
> and has source/destination set or wasn't connected and call to
> sendto(2)/sendmsg(2) would lead to ENOTCONN anyway.
> 
> Connected UDP is already handled by sys_connect hooks that can override
> source/destination at connect time and use fast-path later, i.e. these
> hooks don't affect UDP fast-path.
> 
> Rewriting source IP is implemented differently than that in sys_connect
> hooks. When sys_sendmsg is used with unconnected UDP it doesn't work to
> just bind socket to desired local IP address since source IP can be set
> on per-packet basis by using ancillary data (cmsg(3)). So no matter if
> socket is bound or not, source IP has to be rewritten on every call to
> sys_sendmsg.
> 
> To do so two new fields are added to UAPI `struct bpf_sock_addr`;
> * `msg_src_ip4` to set source IPv4 for UDPv4;
> * `msg_src_ip6` to set source IPv6 for UDPv6.
> 
> Signed-off-by: Andrey Ignatov <r...@fb.com>
> Acked-by: Alexei Starovoitov <a...@kernel.org>
> ---
>  include/linux/bpf-cgroup.h | 23 +++++++++++++++++------
>  include/linux/filter.h     |  1 +
>  include/uapi/linux/bpf.h   |  8 ++++++++
>  kernel/bpf/cgroup.c        | 11 ++++++++++-
>  kernel/bpf/syscall.c       |  8 ++++++++
>  net/core/filter.c          | 39 +++++++++++++++++++++++++++++++++++++++
>  net/ipv4/udp.c             | 20 ++++++++++++++++++--
>  net/ipv6/udp.c             | 17 +++++++++++++++++
>  8 files changed, 118 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 30d15e6..46f01ba 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -66,7 +66,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
>  
>  int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
>                                     struct sockaddr *uaddr,
> -                                   enum bpf_attach_type type);
> +                                   enum bpf_attach_type type,
> +                                   void *t_ctx);
>  
>  int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
>                                    struct bpf_sock_ops_kern *sock_ops,
> @@ -120,16 +121,18 @@ int __cgroup_bpf_check_dev_permission(short dev_type, 
> u32 major, u32 minor,
>  ({                                                                          \
>       int __ret = 0;                                                         \
>       if (cgroup_bpf_enabled)                                                \
> -             __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);    \
> +             __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
> +                                                       NULL);               \
>       __ret;                                                                 \
>  })
>  
> -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type)                        \
> +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx)                 \
>  ({                                                                          \
>       int __ret = 0;                                                         \
>       if (cgroup_bpf_enabled) {                                              \
>               lock_sock(sk);                                                 \
> -             __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);    \
> +             __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
> +                                                       t_ctx);              \
>               release_sock(sk);                                              \
>       }                                                                      \
>       __ret;                                                                 \
> @@ -151,10 +154,16 @@ int __cgroup_bpf_check_dev_permission(short dev_type, 
> u32 major, u32 minor,
>       BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
>  
>  #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)                   \
> -     BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
> +     BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL)
>  
>  #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)                   \
> -     BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
> +     BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL)
> +
> +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx)              
>        \
> +     BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx)
> +
> +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)              
>        \
> +     BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx)
>  
>  #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)                               
>        \
>  ({                                                                          \
> @@ -197,6 +206,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) 
> { return 0; }
>  #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
> +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr) ({ 0; })
> +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
>  
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index d358d18..d90abda 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1010,6 +1010,7 @@ struct bpf_sock_addr_kern {
>        * only two (src and dst) are available at convert_ctx_access time
>        */
>       u64 tmp_reg;
> +     void *t_ctx;    /* Attach type specific context. */
>  };
>  
>  struct bpf_sock_ops_kern {
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 97446bb..b70ad2c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -158,6 +158,8 @@ enum bpf_attach_type {
>       BPF_CGROUP_INET6_CONNECT,
>       BPF_CGROUP_INET4_POST_BIND,
>       BPF_CGROUP_INET6_POST_BIND,
> +     BPF_CGROUP_UDP4_SENDMSG,
> +     BPF_CGROUP_UDP6_SENDMSG,
>       __MAX_BPF_ATTACH_TYPE
>  };
>  
> @@ -2247,6 +2249,12 @@ struct bpf_sock_addr {
>       __u32 family;           /* Allows 4-byte read, but no write */
>       __u32 type;             /* Allows 4-byte read, but no write */
>       __u32 protocol;         /* Allows 4-byte read, but no write */
> +     __u32 msg_src_ip4;      /* Allows 1,2,4-byte read an 4-byte write.
> +                              * Stored in network byte order.
> +                              */
> +     __u32 msg_src_ip6[4];   /* Allows 1,2,4-byte read an 4-byte write.
> +                              * Stored in network byte order.
> +                              */
>  };
>  
>  /* User bpf_sock_ops struct to access socket values and specify request ops
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 43171a0..f7c00bd 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -500,6 +500,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
>   * @sk: sock struct that will use sockaddr
>   * @uaddr: sockaddr struct provided by user
>   * @type: The type of program to be exectuted
> + * @t_ctx: Pointer to attach type specific context
>   *
>   * socket is expected to be of type INET or INET6.
>   *
> @@ -508,12 +509,15 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
>   */
>  int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
>                                     struct sockaddr *uaddr,
> -                                   enum bpf_attach_type type)
> +                                   enum bpf_attach_type type,
> +                                   void *t_ctx)
>  {
>       struct bpf_sock_addr_kern ctx = {
>               .sk = sk,
>               .uaddr = uaddr,
> +             .t_ctx = t_ctx,
>       };
> +     struct sockaddr_storage unspec;
>       struct cgroup *cgrp;
>       int ret;
>  
> @@ -523,6 +527,11 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
>       if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
>               return 0;
>  
> +     if (!ctx.uaddr) {
> +             memset(&unspec, 0, sizeof(unspec));
> +             ctx.uaddr = (struct sockaddr *)&unspec;
> +     }
> +
>       cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
>       ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
>  
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index bfcde94..11a5a95 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1247,6 +1247,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type 
> prog_type,
>               case BPF_CGROUP_INET6_BIND:
>               case BPF_CGROUP_INET4_CONNECT:
>               case BPF_CGROUP_INET6_CONNECT:
> +             case BPF_CGROUP_UDP4_SENDMSG:
> +             case BPF_CGROUP_UDP6_SENDMSG:
>                       return 0;
>               default:
>                       return -EINVAL;
> @@ -1563,6 +1565,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
>       case BPF_CGROUP_INET6_BIND:
>       case BPF_CGROUP_INET4_CONNECT:
>       case BPF_CGROUP_INET6_CONNECT:
> +     case BPF_CGROUP_UDP4_SENDMSG:
> +     case BPF_CGROUP_UDP6_SENDMSG:
>               ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
>               break;
>       case BPF_CGROUP_SOCK_OPS:
> @@ -1633,6 +1637,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
>       case BPF_CGROUP_INET6_BIND:
>       case BPF_CGROUP_INET4_CONNECT:
>       case BPF_CGROUP_INET6_CONNECT:
> +     case BPF_CGROUP_UDP4_SENDMSG:
> +     case BPF_CGROUP_UDP6_SENDMSG:
>               ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
>               break;
>       case BPF_CGROUP_SOCK_OPS:
> @@ -1690,6 +1696,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
>       case BPF_CGROUP_INET6_POST_BIND:
>       case BPF_CGROUP_INET4_CONNECT:
>       case BPF_CGROUP_INET6_CONNECT:
> +     case BPF_CGROUP_UDP4_SENDMSG:
> +     case BPF_CGROUP_UDP6_SENDMSG:
>       case BPF_CGROUP_SOCK_OPS:
>       case BPF_CGROUP_DEVICE:
>               break;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index aec5eba..f696dc9 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5010,6 +5010,7 @@ static bool sock_addr_is_valid_access(int off, int size,
>               switch (prog->expected_attach_type) {
>               case BPF_CGROUP_INET4_BIND:
>               case BPF_CGROUP_INET4_CONNECT:
> +             case BPF_CGROUP_UDP4_SENDMSG:
>                       break;
>               default:
>                       return false;
> @@ -5019,6 +5020,24 @@ static bool sock_addr_is_valid_access(int off, int 
> size,
>               switch (prog->expected_attach_type) {
>               case BPF_CGROUP_INET6_BIND:
>               case BPF_CGROUP_INET6_CONNECT:
> +             case BPF_CGROUP_UDP6_SENDMSG:
> +                     break;
> +             default:
> +                     return false;
> +             }
> +             break;
> +     case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
> +             switch (prog->expected_attach_type) {
> +             case BPF_CGROUP_UDP4_SENDMSG:
> +                     break;
> +             default:
> +                     return false;
> +             }
> +             break;
> +     case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
> +                             msg_src_ip6[3]):
> +             switch (prog->expected_attach_type) {
> +             case BPF_CGROUP_UDP6_SENDMSG:
>                       break;
>               default:
>                       return false;
> @@ -5029,6 +5048,9 @@ static bool sock_addr_is_valid_access(int off, int size,
>       switch (off) {
>       case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
>       case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
> +     case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
> +     case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
> +                             msg_src_ip6[3]):
>               /* Only narrow read access allowed for now. */
>               if (type == BPF_READ) {
>                       bpf_ctx_record_field_size(info, size_default);
> @@ -5783,6 +5805,23 @@ static u32 sock_addr_convert_ctx_access(enum 
> bpf_access_type type,
>               *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
>                                       SK_FL_PROTO_SHIFT);
>               break;
> +
> +     case offsetof(struct bpf_sock_addr, msg_src_ip4):
> +             /* Treat t_ctx as struct in_addr for msg_src_ip4. */
> +             SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
> +                     struct bpf_sock_addr_kern, struct in_addr, t_ctx,
> +                     s_addr, BPF_SIZE(si->code), 0, tmp_reg);
> +             break;
> +
> +     case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
> +                             msg_src_ip6[3]):
> +             off = si->off;
> +             off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
> +             /* Treat t_ctx as struct in6_addr for msg_src_ip6. */
> +             SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
> +                     struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
> +                     s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
> +             break;
>       }
>  
>       return insn - insn_buf;
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index ff4d4ba..a1f9ba2 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -900,6 +900,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, 
> size_t len)
>  {
>       struct inet_sock *inet = inet_sk(sk);
>       struct udp_sock *up = udp_sk(sk);
> +     DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
>       struct flowi4 fl4_stack;
>       struct flowi4 *fl4;
>       int ulen = len;
> @@ -954,8 +955,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, 
> size_t len)
>       /*
>        *      Get and verify the address.
>        */
> -     if (msg->msg_name) {
> -             DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
> +     if (usin) {
>               if (msg->msg_namelen < sizeof(*usin))
>                       return -EINVAL;
>               if (usin->sin_family != AF_INET) {
> @@ -1009,6 +1009,22 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, 
> size_t len)
>               rcu_read_unlock();
>       }
>  
> +     if (!connected) {
> +             err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
> +                                         (struct sockaddr *)usin, &ipc.addr);
> +             if (err)
> +                     goto out_free;
> +             if (usin) {
> +                     if (usin->sin_port == 0) {
> +                             /* BPF program set invalid port. Reject it. */
> +                             err = -EINVAL;
> +                             goto out_free;
> +                     }
> +                     daddr = usin->sin_addr.s_addr;
> +                     dport = usin->sin_port;
> +             }
> +     }
> +
>       saddr = ipc.addr;
>       ipc.addr = faddr = daddr;
>  
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 2839c1b..6f580ea 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -1315,6 +1315,22 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, 
> size_t len)
>               fl6.saddr = np->saddr;
>       fl6.fl6_sport = inet->inet_sport;
>  
> +     if (!connected) {
> +             err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
> +                                        (struct sockaddr *)sin6, &fl6.saddr);
> +             if (err)
> +                     goto out_no_dst;
> +             if (sin6) {
> +                     if (sin6->sin6_port == 0) {
> +                             /* BPF program set invalid port. Reject it. */
> +                             err = -EINVAL;
> +                             goto out_no_dst;
> +                     }
> +                     fl6.fl6_dport = sin6->sin6_port;
> +                     fl6.daddr = sin6->sin6_addr;
Could the bpf_prog change sin6 to a v4mapped address?


> +             }
> +     }
> +
>       final_p = fl6_update_dst(&fl6, opt, &final);
It seems fl6_update_dst() may update fl6->daddr.
Is it fine (or inline with the expectation after running
a bpf_prog that has changed user_ip6)?

>       if (final_p)
>               connected = false;
> @@ -1394,6 +1410,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, 
> size_t len)
>  
>  out:
>       dst_release(dst);
> +out_no_dst:
A nit.  If dst is init to NULL, can a new exit label
be avoided?


>       fl6_sock_release(flowlabel);
>       txopt_put(opt_to_free);
>       if (!err)
> -- 
> 2.9.5
>

Re: [PATCH bpf-next 1/5] bpf: Hooks for sys_sendmsg

Reply via email to