On 01/16/2018 06:25 PM, Alexei Starovoitov wrote:
> On Fri, Jan 12, 2018 at 10:11:11AM -0800, John Fastabend wrote:
>> This implements a BPF ULP layer to allow policy enforcement and
>> monitoring at the socket layer. In order to support this a new
>> program type BPF_PROG_TYPE_SK_MSG is used to run the policy at
>> the sendmsg/sendpage hook. To attach the policy to sockets a
>> sockmap is used with a new program attach type BPF_SK_MSG_VERDICT.

[...]

> 
> overall design looks clean. imo huge improvement from first version.
> 

Great thanks for the review.

> Few nits:
> 

[...]

>> +
>> +static int bpf_tcp_push(struct sock *sk, struct scatterlist *sg,
>> +                    int *sg_end, int flags, bool charge)
>> +{
>> +    int sendpage_flags = flags | MSG_SENDPAGE_NOTLAST;
>> +    int offset, ret = 0;
>> +    struct page *p;
>> +    size_t size;
>> +
>> +    size = sg->length;
>> +    offset = sg->offset;
>> +
>> +    while (1) {
>> +            if (sg_is_last(sg))
>> +                    sendpage_flags = flags;
>> +
>> +            tcp_rate_check_app_limited(sk);
>> +            p = sg_page(sg);
>> +retry:
>> +            ret = do_tcp_sendpages(sk, p, offset, size, sendpage_flags);
>> +            if (ret != size) {
>> +                    if (ret > 0) {
>> +                            offset += ret;
>> +                            size -= ret;
>> +                            goto retry;
>> +                    }
>> +
>> +                    if (charge)
>> +                            sk_mem_uncharge(sk,
>> +                                            sg->length - size - sg->offset);
> 
> should the bool argument be called 'uncharge' instead ?
> 

Agreed that would be a better name, will update.

>> +
>> +                    sg->offset = offset;
>> +                    sg->length = size;
>> +                    return ret;
>> +            }
>> +
>> +            put_page(p);
>> +            if (charge)
>> +                    sk_mem_uncharge(sk, sg->length);
>> +            *sg_end += 1;
>> +            sg = sg_next(sg);
>> +            if (!sg)
>> +                    break;
>> +
>> +            offset = sg->offset;
>> +            size = sg->length;
>> +    }
>> +
>> +    return 0;
>> +}

[...]

>> +static int bpf_tcp_sendmsg_do_redirect(struct scatterlist *sg, int sg_num,
>> +                                   struct sk_msg_buff *md, int flags)
>> +{
>> +    int i, sg_curr = 0, err, free;
>> +    struct smap_psock *psock;
>> +    struct sock *sk;
>> +
>> +    rcu_read_lock();
>> +    sk = do_msg_redirect_map(md);
>> +    if (unlikely(!sk))
>> +            goto out_rcu;
>> +
>> +    psock = smap_psock_sk(sk);
>> +    if (unlikely(!psock))
>> +            goto out_rcu;
>> +
>> +    if (!refcount_inc_not_zero(&psock->refcnt))
>> +            goto out_rcu;
>> +
>> +    rcu_read_unlock();
>> +    lock_sock(sk);
>> +    err = bpf_tcp_push(sk, sg, &sg_curr, flags, false);
>> +    if (unlikely(err))
>> +            goto out;
>> +    release_sock(sk);
>> +    smap_release_sock(psock, sk);
>> +    return 0;
>> +out_rcu:
>> +    rcu_read_unlock();
>> +out:
>> +    for (i = sg_curr; i < sg_num; ++i) {
>> +            free += sg[i].length;
>> +            put_page(sg_page(&sg[i]));
>> +    }
>> +    return free;
> 
> erro path keeps rcu_lock and sk locked?
> 

yep, although looks like rcu_read_unlock() is OK because its
released above the call but the

   if (unlikely(err))
        goto err

needs to be moved below the smap_release_sock(). Thanks!

>> +}
>> +

[...]

>> +    while (msg_data_left(msg)) {
>> +            int sg_curr;
>> +
>> +            if (sk->sk_err) {
>> +                    err = sk->sk_err;
>> +                    goto out_err;
>> +            }
>> +
>> +            copy = msg_data_left(msg);
>> +            if (!sk_stream_memory_free(sk))
>> +                    goto wait_for_sndbuf;
>> +
>> +            /* sg_size indicates bytes already allocated and sg_num
>> +             * is last sg element used. This is used when alloc_sg
>> +             * partially allocates a scatterlist and then is sent
>> +             * to wait for memory. In normal case (no memory pressure)
>> +             * both sg_nun and sg_size are zero.
>> +             */
>> +            copy = copy - sg_size;
>> +            err = sk_alloc_sg(sk, copy, sg, &sg_num, &sg_size, 0);
>> +            if (err) {
>> +                    if (err != -ENOSPC)
>> +                            goto wait_for_memory;
>> +                    copy = sg_size;
>> +            }
>> +
>> +            err = memcopy_from_iter(sk, sg, sg_num, &msg->msg_iter, copy);
>> +            if (err < 0) {
>> +                    free_sg(sk, sg, 0, sg_num);
>> +                    goto out_err;
>> +            }
>> +
>> +            copied += copy;
>> +
>> +            /* If msg is larger than MAX_SKB_FRAGS we can send multiple
>> +             * scatterlists per msg. However BPF decisions apply to the
>> +             * entire msg.
>> +             */
>> +            if (eval == __SK_NONE)
>> +                    eval = smap_do_tx_msg(sk, psock, &md);
> 
> it seems sk_alloc_sg() will put 64k bytes into sg_data,

Yep upto 64k will be copied from msg into sg data.

> but this program will see only first SG ?

Correct, to read further into the msg we would need to have a helper
or some other way to catch reads/writes past the first 4k and read
the next sg. We have the same limitation in cls_bpf.

I have started a patch on top of this series with the current working
title msg_apply_bytes(int bytes). This would let us apply a verdict to
a set number of bytes instead of the entire msg. By calling
msg_apply_bytes(data_end - data) a user who needs to read an entire msg
could do this in 4k chunks until the entire msg is passed through the
bpf prog.

> and it's typically going to be one page only ?

yep

> then what's the value of waiting for MAX_SKB_FRAGS ?
> 
Its not waiting for MAX_SKB_FRAGS its simple copying up to MAX_SKB_FRAGS
pages in one call if possible. It seems better to me to run this loop
over as much data as we can.

>> +
>> +            switch (eval) {
>> +            case __SK_PASS:
>> +                    sg_mark_end(sg + sg_num - 1);
>> +                    err = bpf_tcp_push(sk, sg, &sg_curr, flags, true);
>> +                    if (unlikely(err)) {
>> +                            copied -= free_sg(sk, sg, sg_curr, sg_num);
>> +                            goto out_err;
>> +                    }
>> +                    break;
>> +            case __SK_REDIRECT:
>> +                    sg_mark_end(sg + sg_num - 1);
>> +                    goto do_redir;
>> +            case __SK_DROP:
>> +            default:
>> +                    copied -= free_sg(sk, sg, 0, sg_num);
>> +                    goto out_err;
>> +            }

[...]

>> +    /* Calculate pkt data pointers and run BPF program */
>> +    md.data = page_address(page) + offset;
>> +    md.data_end = md.data + size;
>> +    _rc = (*prog->bpf_func)(&md, prog->insnsi);
>> +
>> +verdict:
>> +    rcu_read_unlock();
>> +    preempt_enable();
>> +
>> +    /* Moving return codes from UAPI namespace into internal namespace */
>> +    rc = ((_rc == SK_PASS) ? __SK_PASS : __SK_DROP);
>> +
>> +    switch (rc) {
>> +    case __SK_PASS:
>> +            lock_sock(sk);
>> +            rc = tcp_sendpage_locked(sk, page, offset, size, flags);
>> +            release_sock(sk);
>> +            break;
>> +    case __SK_REDIRECT:
>> +            smap_release_sock(psock, sk);
>> +            rc = bpf_tcp_sendpage_do_redirect(page, offset, size, flags,
>> +                                              &md);
> 
> looks like this path wasn't tested,

:/ yep adding test now.

> since above rc = ...; line cannot return REDIRECT...
> probably should be common helper for both tcp_bpf_*() funcs
> to call into bpf and convert rc.
> 

will do in v2 thanks.

>> +            break;
>> +    case __SK_DROP:
>> +    default:
>> +            rc = -EACCES;
>> +    }
>> +
>> +    return rc;
>> +}
>> +
>> +static int bpf_tcp_msg_add(struct smap_psock *psock,
>> +                       struct sock *sk,
>> +                       struct bpf_prog *tx_msg)
>> +{
>> +    struct bpf_prog *orig_tx_msg;
>> +
>> +    orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg);
>> +    if (orig_tx_msg)
>> +            bpf_prog_put(orig_tx_msg);
> 
> the function is replacing the program. why is it called bpf_tcp_msg_add ?
> 

bad name will rename.

>> +
>> +    return tcp_set_ulp_id(sk, TCP_ULP_BPF);
>> +}
>> +

[...]

>> +static int bpf_tcp_ulp_register(void)
>> +{
>> +    tcp_bpf_proto = tcp_prot;
>> +    tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
>> +    tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
>> +    return tcp_register_ulp(&bpf_tcp_ulp_ops);
> 
> I don't see corresponding tcp_unregister_ulp().
> 

There is none. tcp_register_ulp() adds the bpf_tcp_ulp to the list of
available ULPs and never removes it. To remove it we would have to
keep a ref count on the reg/unreg calls. This would require a couple
more patches to the ULP infra and not sure it hurts to leave the ULP
reference around...

Maybe a follow on patch? Or else it could be a patch in front of this
patch.

>> @@ -710,6 +1151,7 @@ static int sock_map_ctx_update_elem(struct 
>> bpf_sock_ops_kern *skops,
>>       */
>>      verdict = READ_ONCE(stab->bpf_verdict);
>>      parse = READ_ONCE(stab->bpf_parse);
>> +    tx_msg = READ_ONCE(stab->bpf_tx_msg);
>>  
>>      if (parse && verdict) {
>>              /* bpf prog refcnt may be zero if a concurrent attach operation
>> @@ -728,6 +1170,17 @@ static int sock_map_ctx_update_elem(struct 
>> bpf_sock_ops_kern *skops,
>>              }
>>      }
>>  
>> +    if (tx_msg) {
>> +            tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
> 
> prog_inc_not_zero() looks scary here.
> Why 'not_zero' is necessary ?
> 

Same reason we use inc_not_zero variants with the verdict/parse
programs,

 /* bpf prog refcnt may be zero if a concurrent attach operation
  * removes the program after the above READ_ONCE() but before
  * we increment the refcnt. If this is the case abort with an
  * error.
  */

>> +            if (IS_ERR(tx_msg)) {
>> +                    if (verdict)
>> +                            bpf_prog_put(verdict);
>> +                    if (parse)
>> +                            bpf_prog_put(parse);
>> +                    return PTR_ERR(tx_msg);
>> +            }
>> +    }
>> +

Thanks!

Reply via email to