On 03/05/18 05:36, Alexei Starovoitov wrote:
> bpfilter.ko consists of bpfilter_kern.c (normal kernel module code)
> and user mode helper code that is embedded into bpfilter.ko
>
> The steps to build bpfilter.ko are the following:
> - main.c is compiled by HOSTCC into the bpfilter_umh elf executable file
> - with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file
>   is converted into bpfilter_umh.o object file
>   with _binary_net_bpfilter_bpfilter_umh_start and _end symbols
>   Example:
>   $ nm ./bld_x64/net/bpfilter/bpfilter_umh.o
>   0000000000004cf8 T _binary_net_bpfilter_bpfilter_umh_end
>   0000000000004cf8 A _binary_net_bpfilter_bpfilter_umh_size
>   0000000000000000 T _binary_net_bpfilter_bpfilter_umh_start
> - bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko
>
> bpfilter_kern.c is a normal kernel module code that calls
> the fork_usermode_blob() helper to execute part of its own data
> as a user mode process.
>
> Notice that _binary_net_bpfilter_bpfilter_umh_start - end
> is placed into .init.rodata section, so it's freed as soon as __init
> function of bpfilter.ko is finished.
> As part of __init the bpfilter.ko does first request/reply action
> via two unix pipe provided by fork_usermode_blob() helper to
> make sure that umh is healthy. If not it will kill it via pid.
>
> Later bpfilter_process_sockopt() will be called from bpfilter hooks
> in get/setsockopt() to pass iptable commands into umh via bpfilter.ko
>
> If admin does 'rmmod bpfilter' the __exit code bpfilter.ko will
> kill umh as well.
>
> Signed-off-by: Alexei Starovoitov <a...@kernel.org>
> ---
>  include/linux/bpfilter.h      | 15 +++++++
>  include/uapi/linux/bpfilter.h | 21 ++++++++++
>  net/Kconfig                   |  2 +
>  net/Makefile                  |  1 +
>  net/bpfilter/Kconfig          | 17 ++++++++
>  net/bpfilter/Makefile         | 24 +++++++++++
>  net/bpfilter/bpfilter_kern.c  | 93 +++++++++++++++++++++++++++++++++++++++++++
>  net/bpfilter/main.c           | 63 +++++++++++++++++++++++++++++
>  net/bpfilter/msgfmt.h         | 17 ++++++++
>  net/ipv4/Makefile             |  2 +
>  net/ipv4/bpfilter/Makefile    |  2 +
>  net/ipv4/bpfilter/sockopt.c   | 42 +++++++++++++++++++
>  net/ipv4/ip_sockglue.c        | 17 ++++++++
>  13 files changed, 316 insertions(+)
>  create mode 100644 include/linux/bpfilter.h
>  create mode 100644 include/uapi/linux/bpfilter.h
>  create mode 100644 net/bpfilter/Kconfig
>  create mode 100644 net/bpfilter/Makefile
>  create mode 100644 net/bpfilter/bpfilter_kern.c
>  create mode 100644 net/bpfilter/main.c
>  create mode 100644 net/bpfilter/msgfmt.h
>  create mode 100644 net/ipv4/bpfilter/Makefile
>  create mode 100644 net/ipv4/bpfilter/sockopt.c
>
> diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
> new file mode 100644
> index 000000000000..687b1760bb9f
> --- /dev/null
> +++ b/include/linux/bpfilter.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_BPFILTER_H
> +#define _LINUX_BPFILTER_H
> +
> +#include <uapi/linux/bpfilter.h>
> +
> +struct sock;
> +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval,
> +                         unsigned int optlen);
> +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval,
> +                         int *optlen);
> +extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
> +                                    char __user *optval,
> +                                    unsigned int optlen, bool is_set);
> +#endif
> diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
> new file mode 100644
> index 000000000000..2ec3cc99ea4c
> --- /dev/null
> +++ b/include/uapi/linux/bpfilter.h
> @@ -0,0 +1,21 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _UAPI_LINUX_BPFILTER_H
> +#define _UAPI_LINUX_BPFILTER_H
> +
> +#include <linux/if.h>
> +
> +enum {
> +     BPFILTER_IPT_SO_SET_REPLACE = 64,
> +     BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65,
> +     BPFILTER_IPT_SET_MAX,
> +};
> +
> +enum {
> +     BPFILTER_IPT_SO_GET_INFO = 64,
> +     BPFILTER_IPT_SO_GET_ENTRIES = 65,
> +     BPFILTER_IPT_SO_GET_REVISION_MATCH = 66,
> +     BPFILTER_IPT_SO_GET_REVISION_TARGET = 67,
> +     BPFILTER_IPT_GET_MAX,
> +};
> +
> +#endif /* _UAPI_LINUX_BPFILTER_H */
> diff --git a/net/Kconfig b/net/Kconfig
> index b62089fb1332..ed6368b306fa 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -201,6 +201,8 @@ source "net/bridge/netfilter/Kconfig"
>  
>  endif
>  
> +source "net/bpfilter/Kconfig"
> +
>  source "net/dccp/Kconfig"
>  source "net/sctp/Kconfig"
>  source "net/rds/Kconfig"
> diff --git a/net/Makefile b/net/Makefile
> index a6147c61b174..7f982b7682bd 100644
> --- a/net/Makefile
> +++ b/net/Makefile
> @@ -20,6 +20,7 @@ obj-$(CONFIG_TLS)           += tls/
>  obj-$(CONFIG_XFRM)           += xfrm/
>  obj-$(CONFIG_UNIX)           += unix/
>  obj-$(CONFIG_NET)            += ipv6/
> +obj-$(CONFIG_BPFILTER)               += bpfilter/
>  obj-$(CONFIG_PACKET)         += packet/
>  obj-$(CONFIG_NET_KEY)                += key/
>  obj-$(CONFIG_BRIDGE)         += bridge/
> diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
> new file mode 100644
> index 000000000000..782a732b9a5c
> --- /dev/null
> +++ b/net/bpfilter/Kconfig
> @@ -0,0 +1,17 @@
> +menuconfig BPFILTER
> +     bool "BPF based packet filtering framework (BPFILTER)"
> +     default n
> +     depends on NET && BPF
> +     help
> +       This builds experimental bpfilter framework that is aiming to
> +       provide netfilter compatible functionality via BPF
> +
> +if BPFILTER
> +config BPFILTER_UMH
> +     tristate "bpftiler kernel module with user mode helper"
Spelling: "bpftiler" should be "bpfilter".
> +     default m
> +     depends on m
> +     help
> +       This builds bpfilter kernel module with embedded user mode helper
> +endif
> +
> diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
> new file mode 100644
> index 000000000000..897eedae523e
> --- /dev/null
> +++ b/net/bpfilter/Makefile
> @@ -0,0 +1,24 @@
> +# SPDX-License-Identifier: GPL-2.0
> +#
> +# Makefile for the Linux BPFILTER layer.
> +#
> +
> +hostprogs-y := bpfilter_umh
> +bpfilter_umh-objs := main.o
> +HOSTCFLAGS += -I. -Itools/include/
> +
> +# a bit of elf magic to convert bpfilter_umh binary into a binary blob
> +# inside bpfilter_umh.o elf file referenced by
> +# _binary_net_bpfilter_bpfilter_umh_start symbol
> +# which bpfilter_kern.c passes further into umh blob loader at run-time
> +quiet_cmd_copy_umh = GEN $@
> +      cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
> +      $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
> +      -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
> +      --rename-section .data=.init.rodata $< $@
> +
> +$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
> +     $(call cmd,copy_umh)
> +
> +obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
> +bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
> diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
> new file mode 100644
> index 000000000000..e0a6fdd5842b
> --- /dev/null
> +++ b/net/bpfilter/bpfilter_kern.c
> @@ -0,0 +1,93 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/umh.h>
> +#include <linux/bpfilter.h>
> +#include <linux/sched.h>
> +#include <linux/sched/signal.h>
> +#include <linux/fs.h>
> +#include <linux/file.h>
> +#include "msgfmt.h"
> +
> +#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
> +#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
> +
> +extern char UMH_start;
> +extern char UMH_end;
> +
> +static struct umh_info info;
> +
> +static void shutdown_umh(struct umh_info *info)
> +{
> +     struct task_struct *tsk;
> +
> +     tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
> +     if (tsk)
> +             force_sig(SIGKILL, tsk);
> +     fput(info->pipe_to_umh);
> +     fput(info->pipe_from_umh);
> +}
> +
> +static void stop_umh(void)
> +{
> +     if (bpfilter_process_sockopt) {
I worry about locking here.  Is it possible for two calls to
bpfilter_process_sockopt() to run in parallel, both fail, and thus both
call stop_umh()?  Nothing serializes the check and the clearing of
bpfilter_process_sockopt, so if both end up calling shutdown_umh(), we
fput() each pipe file twice.
> +             bpfilter_process_sockopt = NULL;
> +             shutdown_umh(&info);
> +     }
> +}
> +
> +static int __bpfilter_process_sockopt(struct sock *sk, int optname,
> +                                   char __user *optval,
> +                                   unsigned int optlen, bool is_set)
> +{
> +     struct mbox_request req;
> +     struct mbox_reply reply;
> +     loff_t pos;
> +     ssize_t n;
> +
> +     req.is_set = is_set;
> +     req.pid = current->pid;
> +     req.cmd = optname;
> +     req.addr = (long)optval;
> +     req.len = optlen;
> +     n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
> +     if (n != sizeof(req)) {
> +             pr_err("write fail %zd\n", n);
> +             stop_umh();
> +             return -EFAULT;
> +     }
> +     pos = 0;
> +     n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
> +     if (n != sizeof(reply)) {
> +             pr_err("read fail %zd\n", n);
> +             stop_umh();
> +             return -EFAULT;
> +     }
> +     return reply.status;
> +}
> +
> +static int __init load_umh(void)
> +{
> +     int err;
> +
> +     err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
> +     if (err)
> +             return err;
> +     pr_info("Loaded umh pid %d\n", info.pid);
> +     bpfilter_process_sockopt = &__bpfilter_process_sockopt;
> +
> +     if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) {
> +             stop_umh();
> +             return -EFAULT;
> +     }
> +     return 0;
> +}
> +
> +static void __exit fini_umh(void)
> +{
> +     stop_umh();
> +}
> +module_init(load_umh);
> +module_exit(fini_umh);
> +MODULE_LICENSE("GPL");
> diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
> new file mode 100644
> index 000000000000..81bbc1684896
> --- /dev/null
> +++ b/net/bpfilter/main.c
> @@ -0,0 +1,63 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define _GNU_SOURCE
> +#include <sys/uio.h>
> +#include <errno.h>
> +#include <stdio.h>
> +#include <sys/socket.h>
> +#include <fcntl.h>
> +#include <unistd.h>
> +#include "include/uapi/linux/bpf.h"
> +#include <asm/unistd.h>
> +#include "msgfmt.h"
> +
> +int debug_fd;
> +
> +static int handle_get_cmd(struct mbox_request *cmd)
> +{
> +     switch (cmd->cmd) {
> +     case 0:
> +             return 0;
> +     default:
> +             break;
> +     }
> +     return -ENOPROTOOPT;
> +}
> +
> +static int handle_set_cmd(struct mbox_request *cmd)
> +{
> +     return -ENOPROTOOPT;
> +}
> +
> +static void loop(void)
> +{
> +     while (1) {
> +             struct mbox_request req;
> +             struct mbox_reply reply;
> +             int n;
> +
> +             n = read(0, &req, sizeof(req));
> +             if (n != sizeof(req)) {
> +                     dprintf(debug_fd, "invalid request %d\n", n);
> +                     return;
> +             }
> +
> +             reply.status = req.is_set ?
> +                     handle_set_cmd(&req) :
> +                     handle_get_cmd(&req);
> +
> +             n = write(1, &reply, sizeof(reply));
> +             if (n != sizeof(reply)) {
> +                     dprintf(debug_fd, "reply failed %d\n", n);
> +                     return;
> +             }
> +     }
> +}
> +
> +int main(void)
> +{
> +     debug_fd = open("/dev/console", 00000002 | 00000100);
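This should probably handle failure of the open() call; as written, a
failed open() leaves debug_fd at -1 and the later dprintf() and close()
calls are made on an invalid descriptor.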
> +     dprintf(debug_fd, "Started bpfilter\n");
> +     loop();
> +     close(debug_fd);
> +     return 0;
> +}
> diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
> new file mode 100644
> index 000000000000..94b9ac9e5114
> --- /dev/null
> +++ b/net/bpfilter/msgfmt.h
> @@ -0,0 +1,17 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _NET_BPFILTER_MSGFMT_H
> +#define _NET_BPFTILER_MSGFMT_H
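Another "bpftiler" typo here: the #define does not match the #ifndef
above it, so the include guard never takes effect.  It should be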
+#define _NET_BPFILTER_MSGFMT_H

-Ed
> +
> +struct mbox_request {
> +     __u64 addr;
> +     __u32 len;
> +     __u32 is_set;
> +     __u32 cmd;
> +     __u32 pid;
> +};
> +
> +struct mbox_reply {
> +     __u32 status;
> +};
> +
> +#endif
> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
> index b379520f9133..7018f91c5a39 100644
> --- a/net/ipv4/Makefile
> +++ b/net/ipv4/Makefile
> @@ -16,6 +16,8 @@ obj-y     := route.o inetpeer.o protocol.o \
>            inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
>            metrics.o
>  
> +obj-$(CONFIG_BPFILTER) += bpfilter/
> +
>  obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
>  obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
>  obj-$(CONFIG_PROC_FS) += proc.o
> diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
> new file mode 100644
> index 000000000000..ce262d76cc48
> --- /dev/null
> +++ b/net/ipv4/bpfilter/Makefile
> @@ -0,0 +1,2 @@
> +obj-$(CONFIG_BPFILTER) += sockopt.o
> +
> diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
> new file mode 100644
> index 000000000000..42a96d2d8d05
> --- /dev/null
> +++ b/net/ipv4/bpfilter/sockopt.c
> @@ -0,0 +1,42 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/uaccess.h>
> +#include <linux/bpfilter.h>
> +#include <uapi/linux/bpf.h>
> +#include <linux/wait.h>
> +#include <linux/kmod.h>
> +
> +int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
> +                             char __user *optval,
> +                             unsigned int optlen, bool is_set);
> +EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
> +
> +int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval,
> +                       unsigned int optlen, bool is_set)
> +{
> +     if (!bpfilter_process_sockopt) {
> +             int err = request_module("bpfilter");
> +
> +             if (err)
> +                     return err;
> +             if (!bpfilter_process_sockopt)
> +                     return -ECHILD;
> +     }
> +     return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
> +}
> +
> +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
> +                         unsigned int optlen)
> +{
> +     return bpfilter_mbox_request(sk, optname, optval, optlen, true);
> +}
> +
> +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
> +                         int __user *optlen)
> +{
> +     int len;
> +
> +     if (get_user(len, optlen))
> +             return -EFAULT;
> +
> +     return bpfilter_mbox_request(sk, optname, optval, len, false);
> +}
> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
> index 5ad2d8ed3a3f..e0791faacb24 100644
> --- a/net/ipv4/ip_sockglue.c
> +++ b/net/ipv4/ip_sockglue.c
> @@ -47,6 +47,8 @@
>  #include <linux/errqueue.h>
>  #include <linux/uaccess.h>
>  
> +#include <linux/bpfilter.h>
> +
>  /*
>   *   SOL_IP control messages.
>   */
> @@ -1244,6 +1246,11 @@ int ip_setsockopt(struct sock *sk, int level,
>               return -ENOPROTOOPT;
>  
>       err = do_ip_setsockopt(sk, level, optname, optval, optlen);
> +#ifdef CONFIG_BPFILTER
> +     if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
> +         optname < BPFILTER_IPT_SET_MAX)
> +             err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
> +#endif
>  #ifdef CONFIG_NETFILTER
>       /* we need to exclude all possible ENOPROTOOPTs except default case */
>       if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
> @@ -1552,6 +1559,11 @@ int ip_getsockopt(struct sock *sk, int level,
>       int err;
>  
>       err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
> +#ifdef CONFIG_BPFILTER
> +     if (optname >= BPFILTER_IPT_SO_GET_INFO &&
> +         optname < BPFILTER_IPT_GET_MAX)
> +             err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
> +#endif
>  #ifdef CONFIG_NETFILTER
>       /* we need to exclude all possible ENOPROTOOPTs except default case */
>       if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
> @@ -1584,6 +1596,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
>       err = do_ip_getsockopt(sk, level, optname, optval, optlen,
>               MSG_CMSG_COMPAT);
>  
> +#ifdef CONFIG_BPFILTER
> +     if (optname >= BPFILTER_IPT_SO_GET_INFO &&
> +         optname < BPFILTER_IPT_GET_MAX)
> +             err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
> +#endif
>  #ifdef CONFIG_NETFILTER
>       /* we need to exclude all possible ENOPROTOOPTs except default case */
>       if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&


Reply via email to