This patch adds a new BPF helper function, sk_lookup() which allows BPF programs to find out if there is a socket listening on this host, and returns a socket pointer which the BPF program can then access to determine, for instance, whether to forward or drop traffic. sk_lookup() takes a reference on the socket, so when a BPF program makes use of this function, it must subsequently pass the returned pointer into the newly added sk_release() to return the reference.
By way of example, the following pseudocode would filter inbound connections at the TC hook if there is no corresponding service listening for the traffic: struct bpf_sock_tuple tuple; struct bpf_sock_ops *sk; populate_tuple(ctx, &tuple); // Extract the 5-tuple from the packet sk = bpf_sk_lookup(ctx, &tuple, sizeof tuple, netns, 0); if (!sk) { // Couldn't find a socket listening for this traffic. Drop. return TC_ACT_SHOT; } bpf_sk_release(sk, 0); return TC_ACT_OK; Signed-off-by: Joe Stringer <j...@wand.net.nz> --- include/uapi/linux/bpf.h | 39 +++++++++++- kernel/bpf/verifier.c | 8 ++- net/core/filter.c | 102 ++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 40 +++++++++++- tools/testing/selftests/bpf/bpf_helpers.h | 7 ++ 5 files changed, 193 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d615c777b573..29f38838dbca 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1828,6 +1828,25 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags) + * Description + * Look for socket matching 'tuple'. The return value must be checked, + * and if non-NULL, released via bpf_sk_release(). + * @ctx: pointer to ctx + * @tuple: pointer to struct bpf_sock_tuple + * @tuple_size: size of the tuple + * @flags: flags value + * Return + * pointer to socket ops on success, or + * NULL in case of failure + * + * int bpf_sk_release(sock, flags) + * Description + * Release the reference held by 'sock'. + * @sock: Pointer reference to release. Must be found via bpf_sk_lookup(). + * @flags: flags value + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1898,7 +1917,9 @@ union bpf_attr { FN(xdp_adjust_tail), \ FN(skb_get_xfrm_state), \ FN(get_stack), \ - FN(skb_load_bytes_relative), + FN(skb_load_bytes_relative), \ + FN(sk_lookup), \ + FN(sk_release), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2060,6 +2081,22 @@ struct bpf_sock { */ }; +struct bpf_sock_tuple { + union { + __be32 ipv6[4]; + __be32 ipv4; + } saddr; + union { + __be32 ipv6[4]; + __be32 ipv4; + } daddr; + __be16 sport; + __be16 dport; + __u32 dst_if; + __u8 family; + __u8 proto; +}; + #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92b9a5dc465a..579012c483e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type * passes through a NULL-check conditional. For the branch wherein the state is * changed to CONST_IMM, the verifier releases the reference. + * + * For each helper function that allocates a reference, such as bpf_sk_lookup(), + * there is a corresponding release function, such as bpf_sk_release(). When + * a reference type passes into the release function, the verifier also releases + * the reference. If any unchecked or unreleased reference remains at the end of + * the program, the verifier rejects it. 
*/ /* verifier_state + insn_idx are pushed to stack when branch is encountered */ @@ -277,7 +283,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return false; + return func_id == BPF_FUNC_sk_release; } /* string representation of 'enum bpf_reg_type' */ diff --git a/net/core/filter.c b/net/core/filter.c index 4c35152fb3a8..751c255d17d3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -58,8 +58,12 @@ #include <net/busy_poll.h> #include <net/tcp.h> #include <net/xfrm.h> +#include <net/udp.h> #include <linux/bpf_trace.h> #include <net/xdp_sock.h> +#include <net/inet_hashtables.h> +#include <net/inet6_hashtables.h> +#include <net/net_namespace.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -4032,6 +4036,96 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { }; #endif +struct sock * +sk_lookup(struct net *net, struct bpf_sock_tuple *tuple) { + int dst_if = (int)tuple->dst_if; + struct in6_addr *src6; + struct in6_addr *dst6; + + if (tuple->family == AF_INET6) { + src6 = (struct in6_addr *)&tuple->saddr.ipv6; + dst6 = (struct in6_addr *)&tuple->daddr.ipv6; + } else if (tuple->family != AF_INET) { + return ERR_PTR(-EOPNOTSUPP); + } + + if (tuple->proto == IPPROTO_TCP) { + if (tuple->family == AF_INET) + return inet_lookup(net, &tcp_hashinfo, NULL, 0, + tuple->saddr.ipv4, tuple->sport, + tuple->daddr.ipv4, tuple->dport, + dst_if); + else + return inet6_lookup(net, &tcp_hashinfo, NULL, 0, + src6, tuple->sport, + dst6, tuple->dport, dst_if); + } else if (tuple->proto == IPPROTO_UDP) { + if (tuple->family == AF_INET) + return udp4_lib_lookup(net, tuple->saddr.ipv4, + tuple->sport, tuple->daddr.ipv4, + tuple->dport, dst_if); + else + return udp6_lib_lookup(net, src6, tuple->sport, + dst6, tuple->dport, dst_if); + } else { + return ERR_PTR(-EOPNOTSUPP); + } + + return NULL; +} + +BPF_CALL_5(bpf_sk_lookup, struct sk_buff *, skb, + struct 
bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) +{ + struct net *caller_net = dev_net(skb->dev); + struct sock *sk = NULL; + struct net *net; + + /* XXX: Perform verification-time checking of tuple size? */ + if (unlikely(len != sizeof(struct bpf_sock_tuple) || flags)) + goto out; + + net = get_net_ns_by_id(caller_net, netns_id); + if (unlikely(!net)) + goto out; + + sk = sk_lookup(net, tuple); + put_net(net); + if (IS_ERR_OR_NULL(sk)) + sk = NULL; + else + sk = sk_to_full_sk(sk); +out: + return (unsigned long) sk; +} + +static const struct bpf_func_proto bpf_sk_lookup_proto = { + .func = bpf_sk_lookup, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_sk_release, struct sock *, sk, u64, flags) +{ + sock_gen_put(sk); + if (unlikely(flags)) + return -EINVAL; + return 0; +} + +static const struct bpf_func_proto bpf_sk_release_proto = { + .func = bpf_sk_release, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -4181,6 +4275,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif + case BPF_FUNC_sk_lookup: + return &bpf_sk_lookup_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; default: return bpf_base_func_proto(func_id); } @@ -4292,6 +4390,10 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; + case BPF_FUNC_sk_lookup: + return &bpf_sk_lookup_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; default: return bpf_base_func_proto(func_id); } diff --git 
a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index fff51c187d1e..29f38838dbca 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -117,6 +117,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP, BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_CPUMAP, + BPF_MAP_TYPE_XSKMAP, }; enum bpf_prog_type { @@ -1827,6 +1828,25 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags) + * Description + * Look for socket matching 'tuple'. The return value must be checked, + * and if non-NULL, released via bpf_sk_release(). + * @ctx: pointer to ctx + * @tuple: pointer to struct bpf_sock_tuple + * @tuple_size: size of the tuple + * @flags: flags value + * Return + * pointer to socket ops on success, or + * NULL in case of failure + * + * int bpf_sk_release(sock, flags) + * Description + * Release the reference held by 'sock'. + * @sock: Pointer reference to release. Must be found via bpf_sk_lookup(). + * @flags: flags value + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1897,7 +1917,9 @@ union bpf_attr { FN(xdp_adjust_tail), \ FN(skb_get_xfrm_state), \ FN(get_stack), \ - FN(skb_load_bytes_relative), + FN(skb_load_bytes_relative), \ + FN(sk_lookup), \ + FN(sk_release), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2059,6 +2081,22 @@ struct bpf_sock { */ }; +struct bpf_sock_tuple { + union { + __be32 ipv6[4]; + __be32 ipv4; + } saddr; + union { + __be32 ipv6[4]; + __be32 ipv4; + } daddr; + __be16 sport; + __be16 dport; + __u32 dst_if; + __u8 family; + __u8 proto; +}; + #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. 
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 265f8e0e8ada..4dc311ea0c16 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -103,6 +103,13 @@ static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state, (void *) BPF_FUNC_skb_get_xfrm_state; static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) = (void *) BPF_FUNC_get_stack; +static struct bpf_sock *(*bpf_sk_lookup)(void *ctx, + struct bpf_sock_tuple *tuple, + int size, unsigned int netns_id, + unsigned long long flags) = + (void *) BPF_FUNC_sk_lookup; +static int (*bpf_sk_release)(struct bpf_sock *sk, unsigned long long flags) = + (void *) BPF_FUNC_sk_release; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- 2.14.1