[PATCH bpf-next 3/5] bpf: introduce bpf_get_callchain_stackid

Song Liu Fri, 10 Jul 2020 18:29:26 -0700

This helper is only used by BPF programs attached to perf_event. If the
perf_event has PEBS entries, calling get_perf_callchain() from a BPF
program may cause unwinder errors. bpf_get_callchain_stackid() serves as an
alternative to bpf_get_stackid() for these BPF programs.


Signed-off-by: Song Liu <songliubrav...@fb.com>
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       | 43 +++++++++++++++++++++++
 kernel/bpf/stackmap.c          | 63 ++++++++++++++++++++++++++--------
 kernel/bpf/verifier.c          |  4 ++-
 kernel/trace/bpf_trace.c       |  2 ++
 scripts/bpf_helpers_doc.py     |  2 ++
 tools/include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++
 7 files changed, 142 insertions(+), 16 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0cd7f6884c5cd..45cf12acb0e26 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1628,6 +1628,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_get_task_stack_proto;
+extern const struct bpf_func_proto bpf_get_callchain_stackid_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 548a749aebb3e..a808accfbd457 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3319,6 +3319,48 @@ union bpf_attr {
  *             A non-negative value equal to or less than *size* on success,
  *             or a negative error in case of failure.
  *
+ * long bpf_get_callchain_stackid(struct perf_callchain_entry *callchain, struct bpf_map *map, u64 flags)
+ *     Description
+ *             Return the id of the stack trace stored in *callchain*. To
+ *             achieve this, the helper needs *callchain*, which is a pointer
+ *             to a valid perf_callchain_entry, and a pointer to a *map* of
+ *             type **BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ *             The last argument, *flags*, holds the number of stack frames to
+ *             skip (from 0 to 255), masked with
+ *             **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *             a combination of the following flags:
+ *
+ *             **BPF_F_USER_STACK**
+ *                     Collect a user space stack instead of a kernel stack.
+ *             **BPF_F_FAST_STACK_CMP**
+ *                     Compare stacks by hash only.
+ *             **BPF_F_REUSE_STACKID**
+ *                     If two different stacks hash into the same *stackid*,
+ *                     discard the old one.
+ *
+ *             The stack id retrieved is a 32 bit long integer handle which
+ *             can be further combined with other data (including other stack
+ *             ids) and used as a key into maps. This can be useful for
+ *             generating a variety of graphs (such as flame graphs or off-cpu
+ *             graphs).
+ *
+ *             For walking a stack, this helper is an improvement over
+ *             **bpf_probe_read**\ (), which can be used with unrolled loops
+ *             but is not efficient and consumes a lot of eBPF instructions.
+ *             Instead, **bpf_get_callchain_stackid**\ () can collect up to
+ *             **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
+ *             this limit can be controlled with the **sysctl** program, and
+ *             that it should be manually increased in order to profile long
+ *             user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *             ::
+ *
+ *                     # sysctl kernel.perf_event_max_stack=<new value>
+ *     Return
+ *             The positive or null stack id on success, or a negative error
+ *             in case of failure.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -3463,6 +3505,7 @@ union bpf_attr {
        FN(skc_to_tcp_request_sock),    \
        FN(skc_to_udp6_sock),           \
        FN(get_task_stack),             \
+       FN(get_callchain_stackid),      \
        /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index a6c361ed7937b..28acc610f7f94 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -386,11 +386,10 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 #endif
 }
 
-BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
-          u64, flags)
+static long __bpf_get_stackid(struct bpf_map *map, struct perf_callchain_entry *trace,
+                             u64 flags)
 {
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-       struct perf_callchain_entry *trace;
        struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
        u32 max_depth = map->value_size / stack_map_data_size(map);
        /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
@@ -398,21 +397,9 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        u32 hash, id, trace_nr, trace_len;
        bool user = flags & BPF_F_USER_STACK;
-       bool kernel = !user;
        u64 *ips;
        bool hash_matches;
 
-       if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
-                              BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
-               return -EINVAL;
-
-       trace = get_perf_callchain(regs, init_nr, kernel, user,
-                                  sysctl_perf_event_max_stack, false, false);
-
-       if (unlikely(!trace))
-               /* couldn't fetch the stack trace */
-               return -EFAULT;
-
        /* get_perf_callchain() guarantees that trace->nr >= init_nr
         * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
         */
@@ -477,6 +464,30 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
        return id;
 }
 
+BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
+          u64, flags)
+{
+       u32 max_depth = map->value_size / stack_map_data_size(map);
+       /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+       u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+       bool user = flags & BPF_F_USER_STACK;
+       struct perf_callchain_entry *trace;
+       bool kernel = !user;
+
+       if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+                              BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+               return -EINVAL;
+
+       trace = get_perf_callchain(regs, init_nr, kernel, user,
+                                  sysctl_perf_event_max_stack, false, false);
+
+       if (unlikely(!trace))
+               /* couldn't fetch the stack trace */
+               return -EFAULT;
+
+       return __bpf_get_stackid(map, trace, flags);
+}
+
 const struct bpf_func_proto bpf_get_stackid_proto = {
        .func           = bpf_get_stackid,
        .gpl_only       = true,
@@ -486,6 +497,28 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
        .arg3_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_get_callchain_stackid, struct perf_callchain_entry *, callchain,
+          struct bpf_map *, map, u64, flags)
+{
+       if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+                              BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+               return -EINVAL;
+       if (!callchain)
+               return -EFAULT;
+       return __bpf_get_stackid(map, callchain, flags);
+}
+
+static int bpf_get_callchain_stackid_btf_ids[5];
+const struct bpf_func_proto bpf_get_callchain_stackid_proto = {
+       .func           = bpf_get_callchain_stackid,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_BTF_ID,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_ANYTHING,
+       .btf_id         = bpf_get_callchain_stackid_btf_ids,
+};
+
 static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
                            void *buf, u32 size, u64 flags)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1e11b0f6fba31..07be75550ca93 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4094,7 +4094,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
                        goto error;
                break;
        case BPF_MAP_TYPE_STACK_TRACE:
-               if (func_id != BPF_FUNC_get_stackid)
+               if (func_id != BPF_FUNC_get_stackid &&
+                   func_id != BPF_FUNC_get_callchain_stackid)
                        goto error;
                break;
        case BPF_MAP_TYPE_CGROUP_ARRAY:
@@ -4187,6 +4188,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
                        goto error;
                break;
        case BPF_FUNC_get_stackid:
+       case BPF_FUNC_get_callchain_stackid:
                if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
                        goto error;
                break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c014846c2723c..7a504f734a025 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1396,6 +1396,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_perf_prog_read_value_proto;
        case BPF_FUNC_read_branch_records:
                return &bpf_read_branch_records_proto;
+       case BPF_FUNC_get_callchain_stackid:
+               return &bpf_get_callchain_stackid_proto;
        default:
                return bpf_tracing_func_proto(func_id, prog);
        }
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 6843376733df8..1b99e3618e492 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -427,6 +427,7 @@ class PrinterHelpers(Printer):
             'struct tcp_request_sock',
             'struct udp6_sock',
             'struct task_struct',
+            'struct perf_callchain_entry',
 
             'struct __sk_buff',
             'struct sk_msg_md',
@@ -470,6 +471,7 @@ class PrinterHelpers(Printer):
             'struct tcp_request_sock',
             'struct udp6_sock',
             'struct task_struct',
+            'struct perf_callchain_entry',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 548a749aebb3e..a808accfbd457 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3319,6 +3319,48 @@ union bpf_attr {
  *             A non-negative value equal to or less than *size* on success,
  *             or a negative error in case of failure.
  *
+ * long bpf_get_callchain_stackid(struct perf_callchain_entry *callchain, struct bpf_map *map, u64 flags)
+ *     Description
+ *             Return the id of the stack trace stored in *callchain*. To
+ *             achieve this, the helper needs *callchain*, which is a pointer
+ *             to a valid perf_callchain_entry, and a pointer to a *map* of
+ *             type **BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ *             The last argument, *flags*, holds the number of stack frames to
+ *             skip (from 0 to 255), masked with
+ *             **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *             a combination of the following flags:
+ *
+ *             **BPF_F_USER_STACK**
+ *                     Collect a user space stack instead of a kernel stack.
+ *             **BPF_F_FAST_STACK_CMP**
+ *                     Compare stacks by hash only.
+ *             **BPF_F_REUSE_STACKID**
+ *                     If two different stacks hash into the same *stackid*,
+ *                     discard the old one.
+ *
+ *             The stack id retrieved is a 32 bit long integer handle which
+ *             can be further combined with other data (including other stack
+ *             ids) and used as a key into maps. This can be useful for
+ *             generating a variety of graphs (such as flame graphs or off-cpu
+ *             graphs).
+ *
+ *             For walking a stack, this helper is an improvement over
+ *             **bpf_probe_read**\ (), which can be used with unrolled loops
+ *             but is not efficient and consumes a lot of eBPF instructions.
+ *             Instead, **bpf_get_callchain_stackid**\ () can collect up to
+ *             **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
+ *             this limit can be controlled with the **sysctl** program, and
+ *             that it should be manually increased in order to profile long
+ *             user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *             ::
+ *
+ *                     # sysctl kernel.perf_event_max_stack=<new value>
+ *     Return
+ *             The positive or null stack id on success, or a negative error
+ *             in case of failure.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -3463,6 +3505,7 @@ union bpf_attr {
        FN(skc_to_tcp_request_sock),    \
        FN(skc_to_udp6_sock),           \
        FN(get_task_stack),             \
+       FN(get_callchain_stackid),      \
        /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
2.24.1

[PATCH bpf-next 3/5] bpf: introduce bpf_get_callchain_stackid

Reply via email to