On Tue, Mar 4, 2014 at 2:17 PM, Alexei Starovoitov <a...@plumgrid.com> wrote:
> use sk_convert_filter() to convert seccomp BPF into extended BPF
>
> 05-sim-long_jumps.c of libseccomp was used as a micro-benchmark:
>   seccomp_rule_add_exact(ctx,...
>   seccomp_rule_add_exact(ctx,...
>   rc = seccomp_load(ctx);
>   for (i = 0; i < 10000000; i++)
>     syscall(199, 100);
>
> --x86_64--
> old BPF: 8.6 seconds
>    73.38%  bench  [kernel.kallsyms]  [k] sk_run_filter
>    10.70%  bench  libc-2.15.so       [.] syscall
>     5.09%  bench  [kernel.kallsyms]  [k] seccomp_bpf_load
>     1.97%  bench  [kernel.kallsyms]  [k] system_call
> ext BPF: 5.7 seconds
>    66.20%  bench  [kernel.kallsyms]  [k] sk_run_filter_ext
>    16.75%  bench  libc-2.15.so       [.] syscall
>     3.31%  bench  [kernel.kallsyms]  [k] system_call
>     2.88%  bench  [kernel.kallsyms]  [k] __secure_computing
>
> --i386---
> old BPF: 5.4 sec
> ext BPF: 3.8 sec
>
> BPF filters generated by seccomp are very branchy, so ext BPF
> performance is better than old BPF.
>
> Performance gains will be even higher when the extended BPF JIT
> is committed.
>
> Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
>
> ---
> This patch is an RFC to use extended BPF in seccomp.
> Should it be done conditionally, behind a bpf_ext_enable knob?
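
The benchmark pseudo-code above expands to roughly the following. This is a
minimal sketch against the public libseccomp API, not the actual
05-sim-long_jumps.c test (which installs a much longer rule chain); the
placeholder ALLOW rules, the raw syscall number in the rule, and the
default-kill action are my assumptions:

  /* bench.c -- build with: gcc bench.c -lseccomp */
  #include <seccomp.h>
  #include <unistd.h>
  #include <sys/syscall.h>

  int main(void)
  {
          scmp_filter_ctx ctx;
          int i;

          /* default-kill filter; every rule adds branches to the BPF */
          ctx = seccomp_init(SCMP_ACT_KILL);
          if (ctx == NULL)
                  return 1;

          /* placeholders for the long rule chain of the real test;
           * 199 is the raw number of the syscall the loop below makes,
           * exit/exit_group keep the process alive at teardown */
          if (seccomp_rule_add_exact(ctx, SCMP_ACT_ALLOW, 199, 0) ||
              seccomp_rule_add_exact(ctx, SCMP_ACT_ALLOW,
                                     SCMP_SYS(exit_group), 0) ||
              seccomp_rule_add_exact(ctx, SCMP_ACT_ALLOW,
                                     SCMP_SYS(exit), 0))
                  return 1;

          if (seccomp_load(ctx) < 0)      /* filter is now in the kernel */
                  return 1;
          seccomp_release(ctx);

          /* hot loop: each syscall traverses the attached filter(s) */
          for (i = 0; i < 10000000; i++)
                  syscall(199, 100);
          return 0;
  }
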
Kees, Will,

I've played with libseccomp to test this patch and just found your other
test suite for bpf+seccomp: https://github.com/redpig/seccomp
It passes as well on x86_64 and i386.

I'm not sure how representative the 'seccomp' and 'libseccomp' bpf filters
are of real-world use: some of them are very short, some very long. If the
average number of filtered syscalls is > 10, then the upcoming 'ebpf table'
will give another performance boost, since a single table lookup will be
faster than a long sequence of 'if' conditions.

Please review.

Thanks
Alexei

> ---
>  include/linux/seccomp.h |    1 -
>  kernel/seccomp.c        |  112 +++++++++++++++++++++--------------------
>  net/core/filter.c       |    5 ---
>  3 files changed, 51 insertions(+), 67 deletions(-)
>
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 6f19cfd1840e..4054b0994071 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -76,7 +76,6 @@ static inline int seccomp_mode(struct seccomp *s)
>  #ifdef CONFIG_SECCOMP_FILTER
>  extern void put_seccomp_filter(struct task_struct *tsk);
>  extern void get_seccomp_filter(struct task_struct *tsk);
> -extern u32 seccomp_bpf_load(int off);
>  #else /* CONFIG_SECCOMP_FILTER */
>  static inline void put_seccomp_filter(struct task_struct *tsk)
>  {
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index b7a10048a32c..346c597b44cc 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -55,60 +55,27 @@ struct seccomp_filter {
>          atomic_t usage;
>          struct seccomp_filter *prev;
>          unsigned short len;  /* Instruction count */
> -        struct sock_filter insns[];
> +        struct sock_filter_ext insns[];
>  };
>
>  /* Limit any path through the tree to 256KB worth of instructions. */
>  #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
>
> -/**
> - * get_u32 - returns a u32 offset into data
> - * @data: a unsigned 64 bit value
> - * @index: 0 or 1 to return the first or second 32-bits
> - *
> - * This inline exists to hide the length of unsigned long.  If a 32-bit
> - * unsigned long is passed in, it will be extended and the top 32-bits
> - * will be 0. If it is a 64-bit unsigned long, then whatever data is
> - * resident will be properly returned.
> - *
> +/*
>   * Endianness is explicitly ignored and left for BPF program authors to
>   * manage as per the specific architecture.
>   */
> -static inline u32 get_u32(u64 data, int index)
> -{
> -        return ((u32 *)&data)[index];
> -}
> -
> -/* Helper for bpf_load below. */
> -#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
> -/**
> - * bpf_load: checks and returns a pointer to the requested offset
> - * @off: offset into struct seccomp_data to load from
> - *
> - * Returns the requested 32-bits of data.
> - * seccomp_check_filter() should assure that @off is 32-bit aligned
> - * and not out of bounds.  Failure to do so is a BUG.
> - */
> -u32 seccomp_bpf_load(int off)
> +static void populate_seccomp_data(struct seccomp_data *sd)
>  {
>          struct pt_regs *regs = task_pt_regs(current);
> -        if (off == BPF_DATA(nr))
> -                return syscall_get_nr(current, regs);
> -        if (off == BPF_DATA(arch))
> -                return syscall_get_arch(current, regs);
> -        if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
> -                unsigned long value;
> -                int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
> -                int index = !!(off % sizeof(u64));
> -                syscall_get_arguments(current, regs, arg, 1, &value);
> -                return get_u32(value, index);
> -        }
> -        if (off == BPF_DATA(instruction_pointer))
> -                return get_u32(KSTK_EIP(current), 0);
> -        if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
> -                return get_u32(KSTK_EIP(current), 1);
> -        /* seccomp_check_filter should make this impossible. */
> -        BUG();
> +        int i;
> +
> +        sd->nr = syscall_get_nr(current, regs);
> +        sd->arch = syscall_get_arch(current, regs);
> +        for (i = 0; i < 6; i++)
> +                syscall_get_arguments(current, regs, i, 1,
> +                                      (unsigned long *)&sd->args[i]);
> +        sd->instruction_pointer = KSTK_EIP(current);
>  }
>
>  /**
> @@ -133,17 +100,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>
>                  switch (code) {
>                  case BPF_S_LD_W_ABS:
> -                        ftest->code = BPF_S_ANC_SECCOMP_LD_W;
> +                        ftest->code = BPF_LDX | BPF_W | BPF_ABS;
>                          /* 32-bit aligned and not out of bounds. */
>                          if (k >= sizeof(struct seccomp_data) || k & 3)
>                                  return -EINVAL;
>                          continue;
>                  case BPF_S_LD_W_LEN:
> -                        ftest->code = BPF_S_LD_IMM;
> +                        ftest->code = BPF_LD | BPF_IMM;
>                          ftest->k = sizeof(struct seccomp_data);
>                          continue;
>                  case BPF_S_LDX_W_LEN:
> -                        ftest->code = BPF_S_LDX_IMM;
> +                        ftest->code = BPF_LDX | BPF_IMM;
>                          ftest->k = sizeof(struct seccomp_data);
>                          continue;
>                  /* Explicitly include allowed calls. */
> @@ -185,6 +152,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>                  case BPF_S_JMP_JGT_X:
>                  case BPF_S_JMP_JSET_K:
>                  case BPF_S_JMP_JSET_X:
> +                        sk_decode_filter(ftest, ftest);
>                          continue;
>                  default:
>                          return -EINVAL;
> @@ -202,18 +170,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>  static u32 seccomp_run_filters(int syscall)
>  {
>          struct seccomp_filter *f;
> +        struct seccomp_data sd;
>          u32 ret = SECCOMP_RET_ALLOW;
>
>          /* Ensure unexpected behavior doesn't result in failing open. */
>          if (WARN_ON(current->seccomp.filter == NULL))
>                  return SECCOMP_RET_KILL;
>
> +        populate_seccomp_data(&sd);
> +
>          /*
>           * All filters in the list are evaluated and the lowest BPF return
>           * value always takes priority (ignoring the DATA).
>           */
>          for (f = current->seccomp.filter; f; f = f->prev) {
> -                u32 cur_ret = sk_run_filter(NULL, f->insns);
> +                u32 cur_ret = sk_run_filter_ext(&sd, f->insns);
>                  if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
>                          ret = cur_ret;
>          }
> @@ -231,6 +202,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>          struct seccomp_filter *filter;
>          unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
>          unsigned long total_insns = fprog->len;
> +        struct sock_filter *fp;
> +        int new_len;
>          long ret;
>
>          if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
> @@ -252,28 +225,42 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>                               CAP_SYS_ADMIN) != 0)
>                  return -EACCES;
>
> -        /* Allocate a new seccomp_filter */
> -        filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
> -                         GFP_KERNEL|__GFP_NOWARN);
> -        if (!filter)
> +        fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
> +        if (!fp)
>                  return -ENOMEM;
> -        atomic_set(&filter->usage, 1);
> -        filter->len = fprog->len;
>
>          /* Copy the instructions from fprog. */
>          ret = -EFAULT;
> -        if (copy_from_user(filter->insns, fprog->filter, fp_size))
> -                goto fail;
> +        if (copy_from_user(fp, fprog->filter, fp_size))
> +                goto free_prog;
>
>          /* Check and rewrite the fprog via the skb checker */
> -        ret = sk_chk_filter(filter->insns, filter->len);
> +        ret = sk_chk_filter(fp, fprog->len);
>          if (ret)
> -                goto fail;
> +                goto free_prog;
>
>          /* Check and rewrite the fprog for seccomp use */
> -        ret = seccomp_check_filter(filter->insns, filter->len);
> +        ret = seccomp_check_filter(fp, fprog->len);
>          if (ret)
> -                goto fail;
> +                goto free_prog;
> +
> +        /* convert 'sock_filter' insns to 'sock_filter_ext' insns */
> +        ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
> +        if (ret)
> +                goto free_prog;
> +
> +        /* Allocate a new seccomp_filter */
> +        filter = kzalloc(sizeof(struct seccomp_filter) +
> +                         sizeof(struct sock_filter_ext) * new_len,
> +                         GFP_KERNEL|__GFP_NOWARN);
> +        if (!filter)
> +                goto free_prog;
> +
> +        ret = sk_convert_filter(fp, fprog->len, filter->insns, &new_len);
> +        if (ret)
> +                goto free_filter;
> +        atomic_set(&filter->usage, 1);
> +        filter->len = new_len;
>
>          /*
>           * If there is an existing filter, make it the prev and don't drop its
> @@ -282,8 +269,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>          filter->prev = current->seccomp.filter;
>          current->seccomp.filter = filter;
>          return 0;
> -fail:
> +
> +free_filter:
>          kfree(filter);
> +free_prog:
> +        kfree(fp);
>          return ret;
>  }
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 1494421486b7..f144a6a7d939 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -388,11 +388,6 @@ load_b:
>                          A = 0;
>                          continue;
>                  }
> -#ifdef CONFIG_SECCOMP_FILTER
> -                case BPF_S_ANC_SECCOMP_LD_W:
> -                        A = seccomp_bpf_load(fentry->k);
> -                        continue;
> -#endif
>                  default:
>                          WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
>                                         fentry->code, fentry->jt,
> --
> 1.7.9.5
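
P.S. For anyone who hasn't looked at this path before: the kind of classic
BPF filter being converted here looks like the sketch below (illustration
only, not part of the patch; all names are from the uapi headers). Each
BPF_LD|BPF_W|BPF_ABS load used to bounce through seccomp_bpf_load() per
instruction; with this patch the fields come straight out of the
seccomp_data that populate_seccomp_data() fills in once per syscall entry.

  #include <stddef.h>
  #include <linux/audit.h>
  #include <linux/filter.h>
  #include <linux/seccomp.h>
  #include <sys/syscall.h>

  /* kill everything except getpid() on x86_64 */
  static struct sock_filter filter[] = {
          /* load seccomp_data->arch, kill if not the expected ABI */
          BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                   offsetof(struct seccomp_data, arch)),
          BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
          BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
          /* load seccomp_data->nr, allow only getpid */
          BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
                   offsetof(struct seccomp_data, nr)),
          BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
          BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
          BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
  };

  static struct sock_fprog prog = {
          .len = sizeof(filter) / sizeof(filter[0]),
          .filter = filter,
  };

  /* installed via:
   *   prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
   *   prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
   */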