On 01/16/2018 06:25 PM, Alexei Starovoitov wrote: > On Fri, Jan 12, 2018 at 10:11:11AM -0800, John Fastabend wrote: >> This implements a BPF ULP layer to allow policy enforcement and >> monitoring at the socket layer. In order to support this a new >> program type BPF_PROG_TYPE_SK_MSG is used to run the policy at >> the sendmsg/sendpage hook. To attach the policy to sockets a >> sockmap is used with a new program attach type BPF_SK_MSG_VERDICT.
[...] > > overall design looks clean. imo huge improvement from first version. > Great thanks for the review. > Few nits: > [...] >> + >> +static int bpf_tcp_push(struct sock *sk, struct scatterlist *sg, >> + int *sg_end, int flags, bool charge) >> +{ >> + int sendpage_flags = flags | MSG_SENDPAGE_NOTLAST; >> + int offset, ret = 0; >> + struct page *p; >> + size_t size; >> + >> + size = sg->length; >> + offset = sg->offset; >> + >> + while (1) { >> + if (sg_is_last(sg)) >> + sendpage_flags = flags; >> + >> + tcp_rate_check_app_limited(sk); >> + p = sg_page(sg); >> +retry: >> + ret = do_tcp_sendpages(sk, p, offset, size, sendpage_flags); >> + if (ret != size) { >> + if (ret > 0) { >> + offset += ret; >> + size -= ret; >> + goto retry; >> + } >> + >> + if (charge) >> + sk_mem_uncharge(sk, >> + sg->length - size - sg->offset); > > should the bool argument be called 'uncharge' instead ? > Agreed that would be a better name, will update. >> + >> + sg->offset = offset; >> + sg->length = size; >> + return ret; >> + } >> + >> + put_page(p); >> + if (charge) >> + sk_mem_uncharge(sk, sg->length); >> + *sg_end += 1; >> + sg = sg_next(sg); >> + if (!sg) >> + break; >> + >> + offset = sg->offset; >> + size = sg->length; >> + } >> + >> + return 0; >> +} [...] >> +static int bpf_tcp_sendmsg_do_redirect(struct scatterlist *sg, int sg_num, >> + struct sk_msg_buff *md, int flags) >> +{ >> + int i, sg_curr = 0, err, free; >> + struct smap_psock *psock; >> + struct sock *sk; >> + >> + rcu_read_lock(); >> + sk = do_msg_redirect_map(md); >> + if (unlikely(!sk)) >> + goto out_rcu; >> + >> + psock = smap_psock_sk(sk); >> + if (unlikely(!psock)) >> + goto out_rcu; >> + >> + if (!refcount_inc_not_zero(&psock->refcnt)) >> + goto out_rcu; >> + >> + rcu_read_unlock(); >> + lock_sock(sk); >> + err = bpf_tcp_push(sk, sg, &sg_curr, flags, false); >> + if (unlikely(err)) >> + goto out; >> + release_sock(sk); >> + smap_release_sock(psock, sk); >> + return 0; >> +out_rcu: >> + rcu_read_unlock(); >> +out: >> + for (i = sg_curr; i < sg_num; ++i) { >> + free += sg[i].length; >> + put_page(sg_page(&sg[i])); >> + } >> + return free; > > erro path keeps rcu_lock and sk locked? > yep, although looks like rcu_read_unlock() is OK because its released above the call but the if (unlikely(err)) goto err needs to be moved below the smap_release_sock(). Thanks! >> +} >> + [...] >> + while (msg_data_left(msg)) { >> + int sg_curr; >> + >> + if (sk->sk_err) { >> + err = sk->sk_err; >> + goto out_err; >> + } >> + >> + copy = msg_data_left(msg); >> + if (!sk_stream_memory_free(sk)) >> + goto wait_for_sndbuf; >> + >> + /* sg_size indicates bytes already allocated and sg_num >> + * is last sg element used. This is used when alloc_sg >> + * partially allocates a scatterlist and then is sent >> + * to wait for memory. In normal case (no memory pressure) >> + * both sg_nun and sg_size are zero. >> + */ >> + copy = copy - sg_size; >> + err = sk_alloc_sg(sk, copy, sg, &sg_num, &sg_size, 0); >> + if (err) { >> + if (err != -ENOSPC) >> + goto wait_for_memory; >> + copy = sg_size; >> + } >> + >> + err = memcopy_from_iter(sk, sg, sg_num, &msg->msg_iter, copy); >> + if (err < 0) { >> + free_sg(sk, sg, 0, sg_num); >> + goto out_err; >> + } >> + >> + copied += copy; >> + >> + /* If msg is larger than MAX_SKB_FRAGS we can send multiple >> + * scatterlists per msg. However BPF decisions apply to the >> + * entire msg. >> + */ >> + if (eval == __SK_NONE) >> + eval = smap_do_tx_msg(sk, psock, &md); > > it seems sk_alloc_sg() will put 64k bytes into sg_data, Yep upto 64k will be copied from msg into sg data. > but this program will see only first SG ? Correct, to read further into the msg we would need to have a helper or some other way to catch reads/writes past the first 4k and read the next sg. We have the same limitation in cls_bpf. I have started a patch on top of this series with the current working title msg_apply_bytes(int bytes). This would let us apply a verdict to a set number of bytes instead of the entire msg. By calling msg_apply_bytes(data_end - data) a user who needs to read an entire msg could do this in 4k chunks until the entire msg is passed through the bpf prog. > and it's typically going to be one page only ? yep > then what's the value of waiting for MAX_SKB_FRAGS ? > Its not waiting for MAX_SKB_FRAGS its simple copying up to MAX_SKB_FRAGS pages in one call if possible. It seems better to me to run this loop over as much data as we can. >> + >> + switch (eval) { >> + case __SK_PASS: >> + sg_mark_end(sg + sg_num - 1); >> + err = bpf_tcp_push(sk, sg, &sg_curr, flags, true); >> + if (unlikely(err)) { >> + copied -= free_sg(sk, sg, sg_curr, sg_num); >> + goto out_err; >> + } >> + break; >> + case __SK_REDIRECT: >> + sg_mark_end(sg + sg_num - 1); >> + goto do_redir; >> + case __SK_DROP: >> + default: >> + copied -= free_sg(sk, sg, 0, sg_num); >> + goto out_err; >> + } [...] >> + /* Calculate pkt data pointers and run BPF program */ >> + md.data = page_address(page) + offset; >> + md.data_end = md.data + size; >> + _rc = (*prog->bpf_func)(&md, prog->insnsi); >> + >> +verdict: >> + rcu_read_unlock(); >> + preempt_enable(); >> + >> + /* Moving return codes from UAPI namespace into internal namespace */ >> + rc = ((_rc == SK_PASS) ? __SK_PASS : __SK_DROP); >> + >> + switch (rc) { >> + case __SK_PASS: >> + lock_sock(sk); >> + rc = tcp_sendpage_locked(sk, page, offset, size, flags); >> + release_sock(sk); >> + break; >> + case __SK_REDIRECT: >> + smap_release_sock(psock, sk); >> + rc = bpf_tcp_sendpage_do_redirect(page, offset, size, flags, >> + &md); > > looks like this path wasn't tested, :/ yep adding test now. > since above rc = ...; line cannot return REDIRECT... > probably should be common helper for both tcp_bpf_*() funcs > to call into bpf and convert rc. > will do in v2 thanks. >> + break; >> + case __SK_DROP: >> + default: >> + rc = -EACCES; >> + } >> + >> + return rc; >> +} >> + >> +static int bpf_tcp_msg_add(struct smap_psock *psock, >> + struct sock *sk, >> + struct bpf_prog *tx_msg) >> +{ >> + struct bpf_prog *orig_tx_msg; >> + >> + orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); >> + if (orig_tx_msg) >> + bpf_prog_put(orig_tx_msg); > > the function is replacing the program. why is it called bpf_tcp_msg_add ? > bad name will rename. >> + >> + return tcp_set_ulp_id(sk, TCP_ULP_BPF); >> +} >> + [...] >> +static int bpf_tcp_ulp_register(void) >> +{ >> + tcp_bpf_proto = tcp_prot; >> + tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; >> + tcp_bpf_proto.sendpage = bpf_tcp_sendpage; >> + return tcp_register_ulp(&bpf_tcp_ulp_ops); > > I don't see corresponding tcp_unregister_ulp(). > There is none. tcp_register_ulp() adds the bpf_tcp_ulp to the list of available ULPs and never removes it. To remove it we would have to keep a ref count on the reg/unreg calls. This would require a couple more patches to the ULP infra and not sure it hurts to leave the ULP reference around... Maybe a follow on patch? Or else it could be a patch in front of this patch. >> @@ -710,6 +1151,7 @@ static int sock_map_ctx_update_elem(struct >> bpf_sock_ops_kern *skops, >> */ >> verdict = READ_ONCE(stab->bpf_verdict); >> parse = READ_ONCE(stab->bpf_parse); >> + tx_msg = READ_ONCE(stab->bpf_tx_msg); >> >> if (parse && verdict) { >> /* bpf prog refcnt may be zero if a concurrent attach operation >> @@ -728,6 +1170,17 @@ static int sock_map_ctx_update_elem(struct >> bpf_sock_ops_kern *skops, >> } >> } >> >> + if (tx_msg) { >> + tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); > > prog_inc_not_zero() looks scary here. > Why 'not_zero' is necessary ? > Same reason we use inc_not_zero variants with the verdict/parse programs, /* bpf prog refcnt may be zero if a concurrent attach operation * removes the program after the above READ_ONCE() but before * we increment the refcnt. If this is the case abort with an * error. */ >> + if (IS_ERR(tx_msg)) { >> + if (verdict) >> + bpf_prog_put(verdict); >> + if (parse) >> + bpf_prog_put(parse); >> + return PTR_ERR(tx_msg); >> + } >> + } >> + Thanks!