On Fri, May 18, 2018 at 07:21:09PM -0700, Andrey Ignatov wrote: > In addition to already existing BPF hooks for sys_bind and sys_connect, > the patch provides new hooks for sys_sendmsg. > > It leverages existing BPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` > that provides access to socket itlself (properties like family, type, > protocol) and user-passed `struct sockaddr *` so that BPF program can > override destination IP and port for system calls such as sendto(2) or > sendmsg(2) and/or assign source IP to the socket. > > The hooks are implemented as two new attach types: > `BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG` for UDPv4 and > UDPv6 correspondingly. > > UDPv4 and UDPv6 separate attach types for same reason as sys_bind and > sys_connect hooks, i.e. to prevent reading from / writing to e.g. > user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound. > > The difference with already existing hooks is sys_sendmsg are > implemented only for unconnected UDP. > > For TCP it doesn't make sense to change user-provided `struct sockaddr *` > at sendto(2)/sendmsg(2) time since socket either was already connected > and has source/destination set or wasn't connected and call to > sendto(2)/sendmsg(2) would lead to ENOTCONN anyway. > > Connected UDP is already handled by sys_connect hooks that can override > source/destination at connect time and use fast-path later, i.e. these > hooks don't affect UDP fast-path. > > Rewriting source IP is implemented differently than that in sys_connect > hooks. When sys_sendmsg is used with unconnected UDP it doesn't work to > just bind socket to desired local IP address since source IP can be set > on per-packet basis by using ancillary data (cmsg(3)). So no matter if > socket is bound or not, source IP has to be rewritten on every call to > sys_sendmsg. > > To do so two new fields are added to UAPI `struct bpf_sock_addr`; > * `msg_src_ip4` to set source IPv4 for UDPv4; > * `msg_src_ip6` to set source IPv6 for UDPv6. > > Signed-off-by: Andrey Ignatov <r...@fb.com> > Acked-by: Alexei Starovoitov <a...@kernel.org> > --- > include/linux/bpf-cgroup.h | 23 +++++++++++++++++------ > include/linux/filter.h | 1 + > include/uapi/linux/bpf.h | 8 ++++++++ > kernel/bpf/cgroup.c | 11 ++++++++++- > kernel/bpf/syscall.c | 8 ++++++++ > net/core/filter.c | 39 +++++++++++++++++++++++++++++++++++++++ > net/ipv4/udp.c | 20 ++++++++++++++++++-- > net/ipv6/udp.c | 17 +++++++++++++++++ > 8 files changed, 118 insertions(+), 9 deletions(-) > > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h > index 30d15e6..46f01ba 100644 > --- a/include/linux/bpf-cgroup.h > +++ b/include/linux/bpf-cgroup.h > @@ -66,7 +66,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, > > int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, > struct sockaddr *uaddr, > - enum bpf_attach_type type); > + enum bpf_attach_type type, > + void *t_ctx); > > int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, > struct bpf_sock_ops_kern *sock_ops, > @@ -120,16 +121,18 @@ int __cgroup_bpf_check_dev_permission(short dev_type, > u32 major, u32 minor, > ({ \ > int __ret = 0; \ > if (cgroup_bpf_enabled) \ > - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ > + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ > + NULL); \ > __ret; \ > }) > > -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type) \ > +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ > ({ \ > int __ret = 0; \ > if (cgroup_bpf_enabled) { \ > lock_sock(sk); \ > - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ > + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ > + t_ctx); \ > release_sock(sk); \ > } \ > __ret; \ > @@ -151,10 +154,16 @@ int __cgroup_bpf_check_dev_permission(short dev_type, > u32 major, u32 minor, > BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) > > #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ > - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT) > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL) > > #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ > - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT) > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL) > + > +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) > \ > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx) > + > +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) > \ > + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) > > #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) > \ > ({ \ > @@ -197,6 +206,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) > { return 0; } > #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) > +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr) ({ 0; }) > +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) > #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) > > diff --git a/include/linux/filter.h b/include/linux/filter.h > index d358d18..d90abda 100644 > --- a/include/linux/filter.h > +++ b/include/linux/filter.h > @@ -1010,6 +1010,7 @@ struct bpf_sock_addr_kern { > * only two (src and dst) are available at convert_ctx_access time > */ > u64 tmp_reg; > + void *t_ctx; /* Attach type specific context. */ > }; > > struct bpf_sock_ops_kern { > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 97446bb..b70ad2c 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -158,6 +158,8 @@ enum bpf_attach_type { > BPF_CGROUP_INET6_CONNECT, > BPF_CGROUP_INET4_POST_BIND, > BPF_CGROUP_INET6_POST_BIND, > + BPF_CGROUP_UDP4_SENDMSG, > + BPF_CGROUP_UDP6_SENDMSG, > __MAX_BPF_ATTACH_TYPE > }; > > @@ -2247,6 +2249,12 @@ struct bpf_sock_addr { > __u32 family; /* Allows 4-byte read, but no write */ > __u32 type; /* Allows 4-byte read, but no write */ > __u32 protocol; /* Allows 4-byte read, but no write */ > + __u32 msg_src_ip4; /* Allows 1,2,4-byte read an 4-byte write. > + * Stored in network byte order. > + */ > + __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. > + * Stored in network byte order. > + */ > }; > > /* User bpf_sock_ops struct to access socket values and specify request ops > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c > index 43171a0..f7c00bd 100644 > --- a/kernel/bpf/cgroup.c > +++ b/kernel/bpf/cgroup.c > @@ -500,6 +500,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); > * @sk: sock struct that will use sockaddr > * @uaddr: sockaddr struct provided by user > * @type: The type of program to be exectuted > + * @t_ctx: Pointer to attach type specific context > * > * socket is expected to be of type INET or INET6. > * > @@ -508,12 +509,15 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); > */ > int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, > struct sockaddr *uaddr, > - enum bpf_attach_type type) > + enum bpf_attach_type type, > + void *t_ctx) > { > struct bpf_sock_addr_kern ctx = { > .sk = sk, > .uaddr = uaddr, > + .t_ctx = t_ctx, > }; > + struct sockaddr_storage unspec; > struct cgroup *cgrp; > int ret; > > @@ -523,6 +527,11 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, > if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) > return 0; > > + if (!ctx.uaddr) { > + memset(&unspec, 0, sizeof(unspec)); > + ctx.uaddr = (struct sockaddr *)&unspec; > + } > + > cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); > ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index bfcde94..11a5a95 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -1247,6 +1247,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type > prog_type, > case BPF_CGROUP_INET6_BIND: > case BPF_CGROUP_INET4_CONNECT: > case BPF_CGROUP_INET6_CONNECT: > + case BPF_CGROUP_UDP4_SENDMSG: > + case BPF_CGROUP_UDP6_SENDMSG: > return 0; > default: > return -EINVAL; > @@ -1563,6 +1565,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) > case BPF_CGROUP_INET6_BIND: > case BPF_CGROUP_INET4_CONNECT: > case BPF_CGROUP_INET6_CONNECT: > + case BPF_CGROUP_UDP4_SENDMSG: > + case BPF_CGROUP_UDP6_SENDMSG: > ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; > break; > case BPF_CGROUP_SOCK_OPS: > @@ -1633,6 +1637,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) > case BPF_CGROUP_INET6_BIND: > case BPF_CGROUP_INET4_CONNECT: > case BPF_CGROUP_INET6_CONNECT: > + case BPF_CGROUP_UDP4_SENDMSG: > + case BPF_CGROUP_UDP6_SENDMSG: > ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; > break; > case BPF_CGROUP_SOCK_OPS: > @@ -1690,6 +1696,8 @@ static int bpf_prog_query(const union bpf_attr *attr, > case BPF_CGROUP_INET6_POST_BIND: > case BPF_CGROUP_INET4_CONNECT: > case BPF_CGROUP_INET6_CONNECT: > + case BPF_CGROUP_UDP4_SENDMSG: > + case BPF_CGROUP_UDP6_SENDMSG: > case BPF_CGROUP_SOCK_OPS: > case BPF_CGROUP_DEVICE: > break; > diff --git a/net/core/filter.c b/net/core/filter.c > index aec5eba..f696dc9 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -5010,6 +5010,7 @@ static bool sock_addr_is_valid_access(int off, int size, > switch (prog->expected_attach_type) { > case BPF_CGROUP_INET4_BIND: > case BPF_CGROUP_INET4_CONNECT: > + case BPF_CGROUP_UDP4_SENDMSG: > break; > default: > return false; > @@ -5019,6 +5020,24 @@ static bool sock_addr_is_valid_access(int off, int > size, > switch (prog->expected_attach_type) { > case BPF_CGROUP_INET6_BIND: > case BPF_CGROUP_INET6_CONNECT: > + case BPF_CGROUP_UDP6_SENDMSG: > + break; > + default: > + return false; > + } > + break; > + case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): > + switch (prog->expected_attach_type) { > + case BPF_CGROUP_UDP4_SENDMSG: > + break; > + default: > + return false; > + } > + break; > + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], > + msg_src_ip6[3]): > + switch (prog->expected_attach_type) { > + case BPF_CGROUP_UDP6_SENDMSG: > break; > default: > return false; > @@ -5029,6 +5048,9 @@ static bool sock_addr_is_valid_access(int off, int size, > switch (off) { > case bpf_ctx_range(struct bpf_sock_addr, user_ip4): > case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): > + case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): > + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], > + msg_src_ip6[3]): > /* Only narrow read access allowed for now. */ > if (type == BPF_READ) { > bpf_ctx_record_field_size(info, size_default); > @@ -5783,6 +5805,23 @@ static u32 sock_addr_convert_ctx_access(enum > bpf_access_type type, > *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, > SK_FL_PROTO_SHIFT); > break; > + > + case offsetof(struct bpf_sock_addr, msg_src_ip4): > + /* Treat t_ctx as struct in_addr for msg_src_ip4. */ > + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( > + struct bpf_sock_addr_kern, struct in_addr, t_ctx, > + s_addr, BPF_SIZE(si->code), 0, tmp_reg); > + break; > + > + case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], > + msg_src_ip6[3]): > + off = si->off; > + off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); > + /* Treat t_ctx as struct in6_addr for msg_src_ip6. */ > + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( > + struct bpf_sock_addr_kern, struct in6_addr, t_ctx, > + s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); > + break; > } > > return insn - insn_buf; > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c > index ff4d4ba..a1f9ba2 100644 > --- a/net/ipv4/udp.c > +++ b/net/ipv4/udp.c > @@ -900,6 +900,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, > size_t len) > { > struct inet_sock *inet = inet_sk(sk); > struct udp_sock *up = udp_sk(sk); > + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); > struct flowi4 fl4_stack; > struct flowi4 *fl4; > int ulen = len; > @@ -954,8 +955,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, > size_t len) > /* > * Get and verify the address. > */ > - if (msg->msg_name) { > - DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); > + if (usin) { > if (msg->msg_namelen < sizeof(*usin)) > return -EINVAL; > if (usin->sin_family != AF_INET) { > @@ -1009,6 +1009,22 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, > size_t len) > rcu_read_unlock(); > } > > + if (!connected) { > + err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, > + (struct sockaddr *)usin, &ipc.addr); > + if (err) > + goto out_free; > + if (usin) { > + if (usin->sin_port == 0) { > + /* BPF program set invalid port. Reject it. */ > + err = -EINVAL; > + goto out_free; > + } > + daddr = usin->sin_addr.s_addr; > + dport = usin->sin_port; > + } > + } > + > saddr = ipc.addr; > ipc.addr = faddr = daddr; > > diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c > index 2839c1b..6f580ea 100644 > --- a/net/ipv6/udp.c > +++ b/net/ipv6/udp.c > @@ -1315,6 +1315,22 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, > size_t len) > fl6.saddr = np->saddr; > fl6.fl6_sport = inet->inet_sport; > > + if (!connected) { > + err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, > + (struct sockaddr *)sin6, &fl6.saddr); > + if (err) > + goto out_no_dst; > + if (sin6) { > + if (sin6->sin6_port == 0) { > + /* BPF program set invalid port. Reject it. */ > + err = -EINVAL; > + goto out_no_dst; > + } > + fl6.fl6_dport = sin6->sin6_port; > + fl6.daddr = sin6->sin6_addr; Could the bpf_prog change sin6 to a v4mapped address?
> + } > + } > + > final_p = fl6_update_dst(&fl6, opt, &final); It seems fl6_update_dst() may update fl6->daddr. Is it fine (or inline with the expectation after running a bpf_prog that has changed user_ip6)? > if (final_p) > connected = false; > @@ -1394,6 +1410,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, > size_t len) > > out: > dst_release(dst); > +out_no_dst: A nit. If dst is init to NULL, can a new exit label be avoided? > fl6_sock_release(flowlabel); > txopt_put(opt_to_free); > if (!err) > -- > 2.9.5 >