On 4/6/18 2:48 PM, Yonghong Song wrote:
Currently, stackmap and bpf_get_stackid helper are provided
for bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table,
so some stack traces are missing from user perspective.
This patch implements a new helper, bpf_get_stack, will
send stack traces directly to bpf program. The bpf program
is able to see all stack traces, and then can do in-kernel
processing or send stack traces to user space through
shared map or bpf_perf_event_output.
Signed-off-by: Yonghong Song <y...@fb.com>
---
include/linux/bpf.h | 1 +
include/linux/filter.h | 3 ++-
include/uapi/linux/bpf.h | 17 +++++++++++++--
kernel/bpf/stackmap.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 12 ++++++++++-
kernel/bpf/verifier.c | 3 +++
kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++++++++-
7 files changed, 137 insertions(+), 5 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 95a7abd..72ccb9a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -676,6 +676,7 @@ extern const struct bpf_func_proto
bpf_get_current_comm_proto;
extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto;
+extern const struct bpf_func_proto bpf_get_stack_proto;
extern const struct bpf_func_proto bpf_sock_map_update_proto;
/* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index fc4e8f9..9b64f63 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -467,7 +467,8 @@ struct bpf_prog {
dst_needed:1, /* Do we need dst entry? */
blinded:1, /* Was blinded */
is_func:1, /* program is a bpf function */
- kprobe_override:1; /* Do we override a kprobe?
*/
+ kprobe_override:1, /* Do we override a kprobe?
*/
+ need_callchain_buf:1; /* Needs callchain
buffer? */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec897..a4ff5b7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -517,6 +517,17 @@ union bpf_attr {
* other bits - reserved
* Return: >= 0 stackid on success or negative error
*
+ * int bpf_get_stack(ctx, buf, size, flags)
+ * walk user or kernel stack and store the ips in buf
+ * @ctx: struct pt_regs*
+ * @buf: user buffer to fill stack
+ * @size: the buf size
+ * @flags: bits 0-7 - numer of stack frames to skip
+ * bit 8 - collect user stack instead of kernel
+ * bit 11 - get build-id as well if user stack
+ * other bits - reserved
+ * Return: >= 0 size copied on success or negative error
+ *
* s64 bpf_csum_diff(from, from_size, to, to_size, seed)
* calculate csum diff
* @from: raw from buffer
@@ -821,7 +832,8 @@ union bpf_attr {
FN(msg_apply_bytes), \
FN(msg_cork_bytes), \
FN(msg_pull_data), \
- FN(bind),
+ FN(bind), \
+ FN(get_stack),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -855,11 +867,12 @@ enum bpf_func_id {
/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
#define BPF_F_TUNINFO_IPV6 (1ULL << 0)
-/* BPF_FUNC_get_stackid flags. */
+/* BPF_FUNC_get_stackid and BPF_FUNC_get_stack flags. */
#define BPF_F_SKIP_FIELD_MASK 0xffULL
#define BPF_F_USER_STACK (1ULL << 8)
#define BPF_F_FAST_STACK_CMP (1ULL << 9)
#define BPF_F_REUSE_STACKID (1ULL << 10)
+#define BPF_F_USER_BUILD_ID (1ULL << 11)
the comment above is not quite correct.
This new flag is only available for new helper.
/* BPF_FUNC_skb_set_tunnel_key flags. */
#define BPF_F_ZERO_CSUM_TX (1ULL << 1)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 04f6ec1..371c72e 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -402,6 +402,62 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+ bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ bool user = flags & BPF_F_USER_STACK;
+ struct perf_callchain_entry *trace;
+ bool kernel = !user;
+ u64 *ips;
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_USER_BUILD_ID)))
+ return -EINVAL;
+
+ elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
+ : sizeof(u64);
+ if (unlikely(size % elem_size))
+ return -EINVAL;
+
+ num_elem = size / elem_size;
+ if (sysctl_perf_event_max_stack < num_elem)
+ init_nr = 0;
prog's buffer should be zero padded in this case since it
points to uninit_mem.
+ else
+ init_nr = sysctl_perf_event_max_stack - num_elem;
+ trace = get_perf_callchain(regs, init_nr, kernel, user,
+ sysctl_perf_event_max_stack, false, false);
+ if (unlikely(!trace))
+ return -EFAULT;
+
+ trace_nr = trace->nr - init_nr;
+ if (trace_nr <= skip)
+ return -EFAULT;
+
+ trace_nr -= skip;
+ trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
+ copy_len = trace_nr * elem_size;
+ ips = trace->ip + skip + init_nr;
+ if (user && user_build_id)
the combination of kern + user_build_id should probably be rejected
earlier with einval.
+ stack_map_get_build_id_offset(buf, ips, trace_nr, user);
+ else
+ memcpy(buf, ips, copy_len);
+
+ return copy_len;
+}
+
+const struct bpf_func_proto bpf_get_stack_proto = {
+ .func = bpf_get_stack,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
why allow zero size?
I'm not sure the helper will work correctly when size=0
+ .arg4_type = ARG_ANYTHING,
+};
+
/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0244973..2aa3a65 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -984,10 +984,13 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool
do_idr_lock)
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+ bool need_callchain_buf = aux->prog->need_callchain_buf;
free_used_maps(aux);
bpf_prog_uncharge_memlock(aux->prog);
security_bpf_prog_free(aux);
+ if (need_callchain_buf)
+ put_callchain_buffers();
bpf_prog_free(aux->prog);
}
@@ -1004,7 +1007,8 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool
do_idr_lock)
bpf_prog_kallsyms_del(prog->aux->func[i]);
bpf_prog_kallsyms_del(prog);
- call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ synchronize_rcu();
+ __bpf_prog_put_rcu(&prog->aux->rcu);
there should have been lockdep splat.
We cannot call synchronize_rcu here, since we cannot sleep
in some cases.