On 11/25/18 5:05 PM, Daniel Borkmann wrote:
> On 11/23/2018 02:38 AM, John Fastabend wrote:
>> This adds a BPF SK_MSG program helper so that we can pop data from a
>> msg. We use this to pop metadata from a previous push data call.
>>
>> Signed-off-by: John Fastabend <john.fastab...@gmail.com>
>> ---
>>  include/uapi/linux/bpf.h |  13 +++-
>>  net/core/filter.c        | 169 
>> +++++++++++++++++++++++++++++++++++++++++++++++
>>  net/ipv4/tcp_bpf.c       |  14 +++-
>>  3 files changed, 192 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index c1554aa..64681f8 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -2268,6 +2268,16 @@ union bpf_attr {
>>   *
>>   *  Return
>>   *          0 on success, or a negative error in case of failure.
>> + *
>> + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 
>> flags)
>> + *   Description
>> + *          Will remove 'pop' bytes from a msg starting at byte 'start'.
>> + *          This result in ENOMEM errors under certain situations where
>> + *          a allocation and copy are required due to a full ring buffer.
>> + *          However, the helper will try to avoid doing the allocation
>> + *          if possible. Other errors can occur if input parameters are
>> + *          invalid either do to start byte not being valid part of msg
>> + *          payload and/or pop value being to large.
>>   */
>>  #define __BPF_FUNC_MAPPER(FN)               \
>>      FN(unspec),                     \
>> @@ -2360,7 +2370,8 @@ union bpf_attr {
>>      FN(map_push_elem),              \
>>      FN(map_pop_elem),               \
>>      FN(map_peek_elem),              \
>> -    FN(msg_push_data),
>> +    FN(msg_push_data),              \
>> +    FN(msg_pop_data),
>>  
>>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>>   * function eBPF program intends to call
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index f6ca38a..c6b35b5 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -2428,6 +2428,173 @@ static const struct bpf_func_proto 
>> bpf_msg_push_data_proto = {
>>      .arg4_type      = ARG_ANYTHING,
>>  };
>>  
>> +static void sk_msg_shift_left(struct sk_msg *msg, int i)
>> +{
>> +    int prev;
>> +
>> +    do {
>> +            prev = i;
>> +            sk_msg_iter_var_next(i);
>> +            msg->sg.data[prev] = msg->sg.data[i];
>> +    } while (i != msg->sg.end);
>> +
>> +    sk_msg_iter_prev(msg, end);
>> +}
>> +
>> +static void sk_msg_shift_right(struct sk_msg *msg, int i)
>> +{
>> +    struct scatterlist tmp, sge;
>> +
>> +    sk_msg_iter_next(msg, end);
>> +    sge = sk_msg_elem_cpy(msg, i);
>> +    sk_msg_iter_var_next(i);
>> +    tmp = sk_msg_elem_cpy(msg, i);
>> +
>> +    while (i != msg->sg.end) {
>> +            msg->sg.data[i] = sge;
>> +            sk_msg_iter_var_next(i);
>> +            sge = tmp;
>> +            tmp = sk_msg_elem_cpy(msg, i);
>> +    }
>> +}
>> +
>> +BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
>> +       u32, len, u64, flags)
>> +{
>> +    u32 i = 0, l, space, offset = 0;
>> +    u64 last = start + len;
>> +    int pop;
>> +
>> +    if (unlikely(flags))
>> +            return -EINVAL;
>> +
>> +    /* First find the starting scatterlist element */
>> +    i = msg->sg.start;
>> +    do {
>> +            l = sk_msg_elem(msg, i)->length;
>> +
>> +            if (start < offset + l)
>> +                    break;
>> +            offset += l;
>> +            sk_msg_iter_var_next(i);
>> +    } while (i != msg->sg.end);
>> +
>> +    /* Bounds checks: start and pop must be inside message */
>> +    if (start >= offset + l || last >= msg->sg.size)
>> +            return -EINVAL;
>> +
>> +    space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
>> +
>> +    pop = len;
>> +    /* --------------| offset
>> +     * -| start      |------- len ------|
>> +     *
>> +     *  |----- a ----|-------- pop -------|----- b ----|
>> +     *  |______________________________________________| length
>> +     *
>> +     *
>> +     * a:   region at front of scatter element to save
>> +     * b:   region at back of scatter element to save when length > A + pop
>> +     * pop: region to pop from element, same as input 'pop' here will be
>> +     *      decremented below per iteration.
>> +     *
>> +     * Two top-level cases to handle when start != offset, first B is non
>> +     * zero and second B is zero corresponding to when a pop includes more
>> +     * than one element.
>> +     *
>> +     * Then if B is non-zero AND there is no space allocate space and
>> +     * compact A, B regions into page. If there is space shift ring to
>> +     * the rigth free'ing the next element in ring to place B, leaving
>> +     * A untouched except to reduce length.
>> +     */
>> +    if (start != offset) {
>> +            struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
>> +            int a = start;
>> +            int b = sge->length - pop - a;
>> +
>> +            sk_msg_iter_var_next(i);
>> +
>> +            if (pop < sge->length - a) {
>> +                    if (space) {
>> +                            sge->length = a;
>> +                            sk_msg_shift_right(msg, i);
>> +                            nsge = sk_msg_elem(msg, i);
>> +                            get_page(sg_page(sge));
>> +                            sg_set_page(nsge,
>> +                                        sg_page(sge), b, offset + pop);
>> +                    } else {
>> +                            struct page *page, *orig;
>> +                            u8 *to, *from;
>> +
>> +                            page = alloc_pages(__GFP_NOWARN |
>> +                                               __GFP_COMP   | GFP_ATOMIC,
>> +                                               get_order(a + b));
>> +                            if (unlikely(!page))
>> +                                    return -ENOMEM;
>> +
>> +                            sge->length = a;
>> +                            orig = sg_page(sge);
>> +                            from = sg_virt(sge);
>> +                            to = page_address(page);
>> +                            memcpy(to, from, a);
>> +                            memcpy(to + a, from + a + pop, b);
>> +                            sg_set_page(sge, page, a + b, 0);
>> +                            put_page(orig);
>> +                    }
>> +                    pop = 0;
>> +            } else if (pop >= sge->length - a) {
>> +                    sge->length = a;
>> +                    pop -= (sge->length - a);
>> +            }
>> +    }
>> +
>> +    /* From above the current layout _must_ be as follows,
>> +     *
>> +     * -| offset
>> +     * -| start
>> +     *
>> +     *  |---- pop ---|---------------- b ------------|
>> +     *  |____________________________________________| length
>> +     *
>> +     * Offset and start of the current msg elem are equal because in the
>> +     * previous case we handled offset != start and either consumed the
>> +     * entire element and advanced to the next element OR pop == 0.
>> +     *
>> +     * Two cases to handle here are first pop is less than the length
>> +     * leaving some remainder b above. Simply adjust the element's layout
>> +     * in this case. Or pop >= length of the element so that b = 0. In this
>> +     * case advance to next element decrementing pop.
>> +     */
>> +    while (pop) {
>> +            struct scatterlist *sge = sk_msg_elem(msg, i);
>> +
>> +            if (pop < sge->length) {
>> +                    sge->length -= pop;
>> +                    sge->offset += pop;
>> +                    pop = 0;
>> +            } else {
>> +                    pop -= sge->length;
>> +                    sk_msg_shift_left(msg, i);
>> +            }
>> +            sk_msg_iter_var_next(i);
>> +    }
>> +
>> +    sk_mem_uncharge(msg->sk, len - pop);
>> +    msg->sg.size -= (len - pop);
>> +    sk_msg_compute_data_pointers(msg);
>> +    return 0;
>> +}
> 
> Hmm, had to revert. I noticed this will have a use after free. While you do 
> the
> sk_msg_compute_data_pointers() there is nothing from verifier that forces a
> reload of the payload pointers that were set up before the helper call, so 
> they
> can still be used after the bpf_msg_pop_data() even though we dropped ref from
> page etc.

Thanks for catching this. I will send a v2 with bpf_msg_pop_data() added to
the list in bpf_helper_changes_pkt_data.

> Thanks,
> Daniel
> 

Reply via email to