This patch's subject should of course be [PATCH V2 1/1] rather than 0/1.
Sorry about that.

On Wed, Jul 10, 2019 at 08:42:24AM -0700, Kris Van Hees wrote:
> This initial implementation of a tiny subset of DTrace functionality
> provides the following options:
> 
>       dtrace [-lvV] [-b bufsz] -s script
>           -b  set trace buffer size
>           -l  list probes (only works with '-s script' for now)
>           -s  enable or list probes for the specified BPF program
>           -V  report DTrace API version
> 
> The patch comprises quite a bit of code due to DTrace requiring a few
> crucial components, even in its most basic form.
> 
> The code is structured around the command line interface implemented in
> dtrace.c.  It provides option parsing and drives the three modes of
> operation that are currently implemented:
> 
> 1. Report DTrace API version information.
>       Report the version information and terminate.
> 
> 2. List probes in BPF programs.
>       Initialize the list of probes that DTrace recognizes, load BPF
>       programs, parse all BPF ELF section names, resolve them into
>       known probes, and emit the probe names.  Then terminate.
> 
> 3. Load BPF programs and collect tracing data.
>       Initialize the list of probes that DTrace recognizes, load BPF
>       programs and attach them to their corresponding probes, set up
>       perf event output buffers, and start processing tracing data.
> 
> This implementation makes extensive use of BPF (handled by dt_bpf.c) and
> the perf event output ring buffer (handled by dt_buffer.c).  DTrace-style
> probe handling (dt_probe.c) offers an interface to probes that hides the
> implementation details of the individual probe types by provider (dt_fbt.c
> and dt_syscall.c).  Probe lookup by name uses a hashtable implementation
> (dt_hash.c).  The dt_utils.c code populates a list of online CPU ids, so
> we know what CPUs we can obtain tracing data from.
> 
> Building the tool is trivial because its only dependency (libbpf) is in
> the kernel tree under tools/lib/bpf.  A simple 'make' in the tools/dtrace
> directory suffices.
> 
> The 'dtrace' executable needs to run as root because BPF programs cannot
> be loaded by non-root users.
> 
> Signed-off-by: Kris Van Hees <kris.van.h...@oracle.com>
> Reviewed-by: David Mc Lean <david.mcl...@oracle.com>
> Reviewed-by: Eugene Loh <eugene....@oracle.com>
> ---
> Changes in v2:
>         - Use ring_buffer_read_head() and ring_buffer_write_tail() to
>           avoid use of volatile.
>         - Handle perf events that wrap around the ring buffer boundary.
>         - Remove unnecessary PERF_EVENT_IOC_ENABLE.
>         - Remove -I$(srctree)/tools/perf from KBUILD_HOSTCFLAGS since it
>           is not actually used.
>         - Use PT_REGS_PARM1(x), etc instead of my own macros.  Adding 
>           PT_REGS_PARM6(x) in bpf_sample.c because we need to be able to
>           support up to 6 arguments passed by registers.
> ---
>  MAINTAINERS                |   6 +
>  tools/dtrace/Makefile      |  87 ++++++++++
>  tools/dtrace/bpf_sample.c  | 146 ++++++++++++++++
>  tools/dtrace/dt_bpf.c      | 185 ++++++++++++++++++++
>  tools/dtrace/dt_buffer.c   | 338 +++++++++++++++++++++++++++++++++++++
>  tools/dtrace/dt_fbt.c      | 201 ++++++++++++++++++++++
>  tools/dtrace/dt_hash.c     | 211 +++++++++++++++++++++++
>  tools/dtrace/dt_probe.c    | 230 +++++++++++++++++++++++++
>  tools/dtrace/dt_syscall.c  | 179 ++++++++++++++++++++
>  tools/dtrace/dt_utils.c    | 132 +++++++++++++++
>  tools/dtrace/dtrace.c      | 249 +++++++++++++++++++++++++++
>  tools/dtrace/dtrace.h      |  13 ++
>  tools/dtrace/dtrace_impl.h | 101 +++++++++++
>  13 files changed, 2078 insertions(+)
>  create mode 100644 tools/dtrace/Makefile
>  create mode 100644 tools/dtrace/bpf_sample.c
>  create mode 100644 tools/dtrace/dt_bpf.c
>  create mode 100644 tools/dtrace/dt_buffer.c
>  create mode 100644 tools/dtrace/dt_fbt.c
>  create mode 100644 tools/dtrace/dt_hash.c
>  create mode 100644 tools/dtrace/dt_probe.c
>  create mode 100644 tools/dtrace/dt_syscall.c
>  create mode 100644 tools/dtrace/dt_utils.c
>  create mode 100644 tools/dtrace/dtrace.c
>  create mode 100644 tools/dtrace/dtrace.h
>  create mode 100644 tools/dtrace/dtrace_impl.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cfa9ed89c031..410240732d55 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -5485,6 +5485,12 @@ W:     https://linuxtv.org
>  S:   Odd Fixes
>  F:   drivers/media/pci/dt3155/
>  
> +DTRACE
> +M:   Kris Van Hees <kris.van.h...@oracle.com>
> +L:   dtrace-de...@oss.oracle.com
> +S:   Maintained
> +F:   tools/dtrace/
> +
>  DVB_USB_AF9015 MEDIA DRIVER
>  M:   Antti Palosaari <cr...@iki.fi>
>  L:   linux-me...@vger.kernel.org
> diff --git a/tools/dtrace/Makefile b/tools/dtrace/Makefile
> new file mode 100644
> index 000000000000..03ae498d1429
> --- /dev/null
> +++ b/tools/dtrace/Makefile
> @@ -0,0 +1,87 @@
> +# SPDX-License-Identifier: GPL-2.0
> +#
> +# This Makefile is based on samples/bpf.
> +#
> +# Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> +
> +DT_VERSION           := 2.0.0
> +DT_GIT_VERSION               := $(shell git rev-parse HEAD 2>/dev/null || \
> +                                echo Unknown)
> +
> +DTRACE_PATH          ?= $(abspath $(srctree)/$(src))
> +TOOLS_PATH           := $(DTRACE_PATH)/..
> +SAMPLES_PATH         := $(DTRACE_PATH)/../../samples
> +
> +hostprogs-y          := dtrace
> +
> +LIBBPF                       := $(TOOLS_PATH)/lib/bpf/libbpf.a
> +OBJS                 := dt_bpf.o dt_buffer.o dt_utils.o dt_probe.o \
> +                        dt_hash.o \
> +                        dt_fbt.o dt_syscall.o
> +
> +dtrace-objs          := $(OBJS) dtrace.o
> +
> +always                       := $(hostprogs-y)
> +always                       += bpf_sample.o
> +
> +KBUILD_HOSTCFLAGS    += -DDT_VERSION=\"$(DT_VERSION)\"
> +KBUILD_HOSTCFLAGS    += -DDT_GIT_VERSION=\"$(DT_GIT_VERSION)\"
> +KBUILD_HOSTCFLAGS    += -I$(srctree)/tools/lib
> +KBUILD_HOSTCFLAGS    += -I$(srctree)/tools/include/uapi
> +KBUILD_HOSTCFLAGS    += -I$(srctree)/tools/include/
> +KBUILD_HOSTCFLAGS    += -I$(srctree)/usr/include
> +
> +KBUILD_HOSTLDLIBS    := $(LIBBPF) -lelf
> +
> +LLC                  ?= llc
> +CLANG                        ?= clang
> +LLVM_OBJCOPY         ?= llvm-objcopy
> +
> +ifdef CROSS_COMPILE
> +HOSTCC                       = $(CROSS_COMPILE)gcc
> +CLANG_ARCH_ARGS              = -target $(ARCH)
> +endif
> +
> +all:
> +     $(MAKE) -C ../../ $(CURDIR)/ DTRACE_PATH=$(CURDIR)
> +
> +clean:
> +     $(MAKE) -C ../../ M=$(CURDIR) clean
> +     @rm -f *~
> +
> +$(LIBBPF): FORCE
> +     $(MAKE) -C $(dir $@) RM='rm -rf' LDFLAGS= srctree=$(DTRACE_PATH)/../../ O=
> +
> +FORCE:
> +
> +.PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC)
> +
> +verify_cmds: $(CLANG) $(LLC)
> +     @for TOOL in $^ ; do \
> +             if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \
> +                     echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\
> +                     exit 1; \
> +             else true; fi; \
> +     done
> +
> +verify_target_bpf: verify_cmds
> +     @if ! (${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \
> +             echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\
> +             echo "   NOTICE: LLVM version >= 3.7.1 required" ;\
> +             exit 2; \
> +     else true; fi
> +
> +$(DTRACE_PATH)/*.c: verify_target_bpf $(LIBBPF)
> +$(src)/*.c: verify_target_bpf $(LIBBPF)
> +
> +$(obj)/%.o: $(src)/%.c
> +     @echo "  CLANG-bpf " $@
> +     $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
> +             -I$(srctree)/tools/testing/selftests/bpf/ \
> +             -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \
> +             -D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \
> +             -Wno-gnu-variable-sized-type-not-at-end \
> +             -Wno-address-of-packed-member -Wno-tautological-compare \
> +             -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
> +             -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \
> +             -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@
> diff --git a/tools/dtrace/bpf_sample.c b/tools/dtrace/bpf_sample.c
> new file mode 100644
> index 000000000000..9862f75f92d3
> --- /dev/null
> +++ b/tools/dtrace/bpf_sample.c
> @@ -0,0 +1,146 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * This sample DTrace BPF tracing program demonstrates how actions can be
> + * associated with different probe types.
> + *
> + * The kprobe/ksys_write probe is a Function Boundary Tracing (FBT) entry 
> probe
> + * on the ksys_write(fd, buf, count) function in the kernel.  Arguments to 
> the
> + * function can be retrieved from the CPU registers (struct pt_regs).
> + *
> + * The tracepoint/syscalls/sys_enter_write probe is a System Call entry probe
> + * for the write(d, buf, count) system call.  Arguments to the system call can
> + * be retrieved from the tracepoint data passed to the BPF program as context
> + * (struct syscall_data) when the probe fires.
> + *
> + * The BPF program associated with each probe prepares a DTrace BPF context
> + * (struct dt_bpf_context) that stores the probe ID and up to 10 arguments.
> + * Only 3 arguments are used in this sample.  Then the programs call a shared
> + * BPF function (bpf_action) that implements the actual action to be taken when
> + * a probe fires.  It prepares a data record to be stored in the tracing buffer
> + * and submits it to the buffer.  The data in the data record is obtained from
> + * the DTrace BPF context.
> + *
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +#include <uapi/linux/bpf.h>
> +#include <linux/ptrace.h>
> +#include <linux/version.h>
> +#include <uapi/linux/unistd.h>
> +#include "bpf_helpers.h"
> +
> +#include "dtrace.h"
> +
> +struct syscall_data {
> +     struct pt_regs *regs;
> +     long syscall_nr;
> +     long arg[6];
> +};
> +
> +struct bpf_map_def SEC("maps") buffers = {
> +     .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> +     .key_size = sizeof(u32),
> +     .value_size = sizeof(u32),
> +     .max_entries = NR_CPUS,
> +};
> +
> +#if defined(bpf_target_x86)
> +# define PT_REGS_PARM6(x)    ((x)->r9)
> +#elif defined(bpf_target_s390x)
> +# define PT_REGS_PARM6(x)    ((x)->gprs[7])
> +#elif defined(bpf_target_arm)
> +# define PT_REGS_PARM6(x)    ((x)->uregs[5])
> +#elif defined(bpf_target_arm64)
> +# define PT_REGS_PARM6(x)    ((x)->regs[5])
> +#elif defined(bpf_target_mips)
> +# define PT_REGS_PARM6(x)    ((x)->regs[9])
> +#elif defined(bpf_target_powerpc)
> +# define PT_REGS_PARM6(x)    ((x)->gpr[8])
> +#elif defined(bpf_target_sparc)
> +# define PT_REGS_PARM6(x)    ((x)->u_regs[UREG_I5])
> +#else
> +# error Argument retrieval from pt_regs is not supported yet on this arch.
> +#endif
> +
> +/*
> + * We must pass a valid BPF context pointer because the 
> bpf_perf_event_output()
> + * helper requires a BPF context pointer as first argument (and the verifier 
> is
> + * validating that we pass a value that is known to be a context pointer).
> + *
> + * This BPF function implements the following D action:
> + * {
> + *   trace(curthread);
> + *   trace(arg0);
> + *   trace(arg1);
> + *   trace(arg2);
> + * }
> + *
> + * Expected output will look like:
> + *   CPU     ID
> + *    15  70423 0xffff8c0968bf8ec0 0x00000000000001 0x0055e019eb3f60 0x0000000000002c
> + *    15  18876 0xffff8c0968bf8ec0 0x00000000000001 0x0055e019eb3f60 0x0000000000002c
> + *    |   |     +-- curthread      +--> arg0 (fd)   +--> arg1 (buf)  +-- arg2 (count)
> + *    |   |
> + *    |   +--> probe ID
> + *    |
> + *    +--> CPU the probe fired on
> + */
> +static noinline int bpf_action(void *bpf_ctx, struct dt_bpf_context *ctx)
> +{
> +     int                     cpu = bpf_get_smp_processor_id();
> +     struct data {
> +             u32     probe_id;       /* mandatory */
> +
> +             u64     task;           /* first data item (current task) */
> +             u64     arg0;           /* 2nd data item (arg0, fd) */
> +             u64     arg1;           /* 3rd data item (arg1, buf) */
> +             u64     arg2;           /* 4th data item (arg2, count) */
> +     }                       rec;
> +
> +     memset(&rec, 0, sizeof(rec));
> +
> +     rec.probe_id = ctx->probe_id;
> +     rec.task = bpf_get_current_task();
> +     rec.arg0 = ctx->argv[0];
> +     rec.arg1 = ctx->argv[1];
> +     rec.arg2 = ctx->argv[2];
> +
> +     bpf_perf_event_output(bpf_ctx, &buffers, cpu, &rec, sizeof(rec));
> +
> +     return 0;
> +}
> +
> +SEC("kprobe/ksys_write")
> +int bpf_kprobe(struct pt_regs *regs)
> +{
> +     struct dt_bpf_context   ctx;
> +
> +     memset(&ctx, 0, sizeof(ctx));
> +
> +     ctx.probe_id = 18876;
> +     ctx.argv[0] = PT_REGS_PARM1(regs);
> +     ctx.argv[1] = PT_REGS_PARM2(regs);
> +     ctx.argv[2] = PT_REGS_PARM3(regs);
> +     ctx.argv[3] = PT_REGS_PARM4(regs);
> +     ctx.argv[4] = PT_REGS_PARM5(regs);
> +     ctx.argv[5] = PT_REGS_PARM6(regs);
> +
> +     return bpf_action(regs, &ctx);
> +}
> +
> +SEC("tracepoint/syscalls/sys_enter_write")
> +int bpf_tp(struct syscall_data *scd)
> +{
> +     struct dt_bpf_context   ctx;
> +
> +     memset(&ctx, 0, sizeof(ctx));
> +
> +     ctx.probe_id = 70423;
> +     ctx.argv[0] = scd->arg[0];
> +     ctx.argv[1] = scd->arg[1];
> +     ctx.argv[2] = scd->arg[2];
> +
> +     return bpf_action(scd, &ctx);
> +}
> +
> +char _license[] SEC("license") = "GPL";
> +u32 _version SEC("version") = LINUX_VERSION_CODE;
> diff --git a/tools/dtrace/dt_bpf.c b/tools/dtrace/dt_bpf.c
> new file mode 100644
> index 000000000000..78c90de016c6
> --- /dev/null
> +++ b/tools/dtrace/dt_bpf.c
> @@ -0,0 +1,185 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * This file provides the interface for handling BPF.  It uses the bpf 
> library
> + * to interact with BPF ELF object files.
> + *
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +#include <errno.h>
> +#include <stdarg.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <bpf/libbpf.h>
> +#include <linux/kernel.h>
> +#include <linux/perf_event.h>
> +#include <sys/ioctl.h>
> +
> +#include "dtrace_impl.h"
> +
> +/*
> + * Validate the output buffer map that is specified in the BPF ELF object.  
> It
> + * must match the following definition to be valid:
> + *
> + * struct bpf_map_def SEC("maps") buffers = {
> + *   .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> + *   .key_size = sizeof(u32),
> + *   .value_size = sizeof(u32),
> + *   .max_entries = num,
> + * };
> + * where num is greater than dt_maxcpuid.
> + */
> +static int is_valid_buffers(const struct bpf_map_def *mdef)
> +{
> +     return mdef->type == BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
> +            mdef->key_size == sizeof(u32) &&
> +            mdef->value_size == sizeof(u32) &&
> +            mdef->max_entries > dt_maxcpuid;
> +}
> +
> +/*
> + * List the probes specified in the given BPF ELF object file.
> + */
> +int dt_bpf_list_probes(const char *fn)
> +{
> +     struct bpf_object       *obj;
> +     struct bpf_program      *prog;
> +     int                     rc, fd;
> +
> +     libbpf_set_print(NULL);
> +
> +     /*
> +      * Listing probes is done before the DTrace command line utility loads
> +      * the supplied programs.  We load them here without attaching them to
> +      * probes so that we can retrieve the ELF section names for each BPF
> +      * program.  The section name indicates the probe that the program is
> +      * associated with.
> +      */
> +     rc = bpf_prog_load(fn, BPF_PROG_TYPE_UNSPEC, &obj, &fd);
> +     if (rc)
> +             return rc;
> +
> +     /*
> +      * Loop through the programs in the BPF ELF object, and try to resolve
> +      * the section names into probes.  Print the description of each
> +      * resolved probe.
> +      */
> +     for (prog = bpf_program__next(NULL, obj); prog != NULL;
> +          prog = bpf_program__next(prog, obj)) {
> +             struct dt_probe *probe;
> +
> +             probe = dt_probe_resolve_event(bpf_program__title(prog, false));
> +
> +             printf("%5d %10s %17s %33s %s\n", probe->id,
> +                    probe->prv_name ? probe->prv_name : "",
> +                    probe->mod_name ? probe->mod_name : "",
> +                    probe->fun_name ? probe->fun_name : "",
> +                    probe->prb_name ? probe->prb_name : "");
> +     }
> +
> +
> +     /* Done with the BPF ELF object.  */
> +     bpf_object__close(obj);
> +
> +     return 0;
> +}
> +
> +/*
> + * Load the given BPF ELF object file.
> + */
> +int dt_bpf_load_file(const char *fn)
> +{
> +     struct bpf_object       *obj;
> +     struct bpf_map          *map;
> +     struct bpf_program      *prog;
> +     int                     rc, fd;
> +
> +     libbpf_set_print(NULL);
> +
> +     /* Load the BPF ELF object file. */
> +     rc = bpf_prog_load(fn, BPF_PROG_TYPE_UNSPEC, &obj, &fd);
> +     if (rc)
> +             return rc;
> +
> +     /* Validate buffers map. */
> +     map = bpf_object__find_map_by_name(obj, "buffers");
> +     if (map && is_valid_buffers(bpf_map__def(map)))
> +             dt_bufmap_fd = bpf_map__fd(map);
> +     else
> +             goto fail;
> +
> +     /*
> +      * Loop through the programs and resolve each into the matching probe.
> +      * Attach the program to the probe.
> +      */
> +     for (prog = bpf_program__next(NULL, obj); prog != NULL;
> +          prog = bpf_program__next(prog, obj)) {
> +             struct dt_probe *probe;
> +
> +             probe = dt_probe_resolve_event(bpf_program__title(prog, false));
> +             if (!probe)
> +                     return -ENOENT;
> +             if (probe->prov && probe->prov->attach)
> +                     probe->prov->attach(bpf_program__title(prog, false),
> +                                         bpf_program__fd(prog));
> +     }
> +
> +     return 0;
> +
> +fail:
> +     bpf_object__close(obj);
> +     return -EINVAL;
> +}
> +
> +/*
> + * Store the (key, value) pair in the map referenced by the given fd.
> + */
> +int dt_bpf_map_update(int fd, const void *key, const void *val)
> +{
> +     union bpf_attr  attr;
> +
> +     memset(&attr, 0, sizeof(attr));
> +
> +     attr.map_fd = fd;
> +     attr.key = (u64)(unsigned long)key;
> +     attr.value = (u64)(unsigned long)val;
> +     attr.flags = 0;
> +
> +     return bpf(BPF_MAP_UPDATE_ELEM, &attr);
> +}
> +
> +/*
> + * Attach a trace event and associate a BPF program with it.
> + */
> +int dt_bpf_attach(int event_id, int bpf_fd)
> +{
> +     int                     event_fd;
> +     int                     rc;
> +     struct perf_event_attr  attr = {};
> +
> +     attr.type = PERF_TYPE_TRACEPOINT;
> +     attr.sample_type = PERF_SAMPLE_RAW;
> +     attr.sample_period = 1;
> +     attr.wakeup_events = 1;
> +     attr.config = event_id;
> +
> +     /*
> +      * Register the event (based on its id), and obtain a fd.  It gets
> +      * created as an enabled probe, so we don't have to explicitly enable
> +      * it.
> +      */
> +     event_fd = perf_event_open(&attr, -1, 0, -1, 0);
> +     if (event_fd < 0) {
> +             perror("sys_perf_event_open");
> +             return -1;
> +     }
> +
> +     /* Associate the BPF program with the event. */
> +     rc = ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, bpf_fd);
> +     if (rc < 0) {
> +             perror("PERF_EVENT_IOC_SET_BPF");
> +             return -1;
> +     }
> +
> +     return 0;
> +}
> diff --git a/tools/dtrace/dt_buffer.c b/tools/dtrace/dt_buffer.c
> new file mode 100644
> index 000000000000..19bb7e4cfc92
> --- /dev/null
> +++ b/tools/dtrace/dt_buffer.c
> @@ -0,0 +1,338 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * This file provides the tracing buffer handling for DTrace.  It makes use 
> of
> + * the perf event output ring buffers that can be written to from BPF 
> programs.
> + *
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +#include <errno.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <syscall.h>
> +#include <unistd.h>
> +#include <sys/epoll.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <linux/bpf.h>
> +#include <linux/perf_event.h>
> +#include <linux/ring_buffer.h>
> +
> +#include "dtrace_impl.h"
> +
> +/*
> + * Probe data is recorded in per-CPU perf ring buffers.
> + */
> +struct dtrace_buffer {
> +     int     cpu;                    /* ID of CPU that uses this buffer */
> +     int     fd;                     /* fd of perf output buffer */
> +     size_t  page_size;              /* size of each page in buffer */
> +     size_t  data_size;              /* total buffer size */
> +     u8      *base;                  /* address of buffer */
> +     u8      *endp;                  /* address of end of buffer */
> +     u8      *tmp;                   /* temporary event buffer */
> +     u32     tmp_len;                /* length of temporary event buffer */
> +};
> +
> +static struct dtrace_buffer  *dt_buffers;
> +
> +/*
> + * File descriptor for the BPF map that holds the buffers for the online 
> CPUs.
> + * The map is a bpf_array indexed by CPU id, and it stores a file descriptor 
> as
> + * value (the fd for the perf_event that represents the CPU buffer).
> + */
> +int                          dt_bufmap_fd = -1;
> +
> +/*
> + * Create a perf_event buffer for the given DTrace buffer.  This will create
> + * a perf_event ring_buffer, mmap it, and enable the perf_event that owns the
> + * buffer.
> + */
> +static int perf_buffer_open(struct dtrace_buffer *buf)
> +{
> +     int                     pefd;
> +     struct perf_event_attr  attr = {};
> +
> +     /*
> +      * Event configuration for BPF-generated output in perf_event ring
> +      * buffers.  The event is created in enabled state.
> +      */
> +     attr.config = PERF_COUNT_SW_BPF_OUTPUT;
> +     attr.type = PERF_TYPE_SOFTWARE;
> +     attr.sample_type = PERF_SAMPLE_RAW;
> +     attr.sample_period = 1;
> +     attr.wakeup_events = 1;
> +     pefd = perf_event_open(&attr, -1, buf->cpu, -1, PERF_FLAG_FD_CLOEXEC);
> +     if (pefd < 0) {
> +             fprintf(stderr, "perf_event_open(cpu %d): %s\n", buf->cpu,
> +                     strerror(errno));
> +             goto fail;
> +     }
> +
> +     /*
> +      * We add buf->page_size to the buf->data_size, because perf maintains
> +      * a meta-data page at the beginning of the memory region.  That page
> +      * is used for reader/writer synchronization.
> +      */
> +     buf->fd = pefd;
> +     buf->base = mmap(NULL, buf->page_size + buf->data_size,
> +                      PROT_READ | PROT_WRITE, MAP_SHARED, buf->fd, 0);
> +     buf->endp = buf->base + buf->page_size + buf->data_size - 1;
> +     if (!buf->base)
> +             goto fail;
> +
> +     return 0;
> +
> +fail:
> +     if (buf->base) {
> +             munmap(buf->base, buf->page_size + buf->data_size);
> +             buf->base = NULL;
> +             buf->endp = NULL;
> +     }
> +     if (buf->fd) {
> +             close(buf->fd);
> +             buf->fd = -1;
> +     }
> +
> +     return -1;
> +}
> +
> +/*
> + * Close the given DTrace buffer.  This function disables the perf_event that
> + * owns the buffer, munmaps the memory space, and closes the perf buffer fd.
> + */
> +static void perf_buffer_close(struct dtrace_buffer *buf)
> +{
> +     /*
> +      * If the perf buffer failed to open, there is no need to close it.
> +      */
> +     if (buf->fd < 0)
> +             return;
> +
> +     if (ioctl(buf->fd, PERF_EVENT_IOC_DISABLE, 0) < 0)
> +             fprintf(stderr, "PERF_EVENT_IOC_DISABLE(cpu %d): %s\n",
> +                     buf->cpu, strerror(errno));
> +
> +     munmap(buf->base, buf->page_size + buf->data_size);
> +
> +     if (close(buf->fd))
> +             fprintf(stderr, "perf buffer close(cpu %d): %s\n",
> +                     buf->cpu, strerror(errno));
> +
> +     buf->base = NULL;
> +     buf->fd = -1;
> +}
> +
> +/*
> + * Initialize the probe data buffers (one per online CPU).  Each buffer will
> + * contain the given number of pages (i.e. total size of each buffer will be
> + * num_pages * getpagesize()).  This function also sets up an event polling
> + * descriptor that monitors all CPU buffers at once.
> + */
> +int dt_buffer_init(int num_pages)
> +{
> +     int     i;
> +     int     epoll_fd;
> +
> +     if (dt_bufmap_fd < 0)
> +             return -EINVAL;
> +
> +     /* Allocate the per-CPU buffer structs. */
> +     dt_buffers = calloc(dt_numcpus, sizeof(struct dtrace_buffer));
> +     if (dt_buffers == NULL)
> +             return -ENOMEM;
> +
> +     /* Set up the event polling file descriptor. */
> +     epoll_fd = epoll_create1(EPOLL_CLOEXEC);
> +     if (epoll_fd < 0) {
> +             free(dt_buffers);
> +             return -errno;
> +     }
> +
> +     for (i = 0; i < dt_numcpus; i++) {
> +             int                     cpu = dt_cpuids[i];
> +             struct epoll_event      ev;
> +             struct dtrace_buffer    *buf = &dt_buffers[i];
> +
> +             buf->cpu = cpu;
> +             buf->page_size = getpagesize();
> +             buf->data_size = num_pages * buf->page_size;
> +             buf->tmp = NULL;
> +             buf->tmp_len = 0;
> +
> +             /* Try to create the perf buffer for this DTrace buffer. */
> +             if (perf_buffer_open(buf) == -1)
> +                     continue;
> +
> +             /* Store the perf buffer fd in the buffer map. */
> +             dt_bpf_map_update(dt_bufmap_fd, &cpu, &buf->fd);
> +
> +             /* Add the buffer to the event polling descriptor. */
> +             ev.events = EPOLLIN;
> +             ev.data.ptr = buf;
> +             if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, buf->fd, &ev) == -1) {
> +                     fprintf(stderr, "EPOLL_CTL_ADD(cpu %d): %s\n",
> +                             buf->cpu, strerror(errno));
> +                     continue;
> +             }
> +     }
> +
> +     return epoll_fd;
> +}
> +
> +/*
> + * Clean up the buffers.
> + */
> +void dt_buffer_exit(int epoll_fd)
> +{
> +     int     i;
> +
> +     for (i = 0; i < dt_numcpus; i++)
> +             perf_buffer_close(&dt_buffers[i]);
> +
> +     free(dt_buffers);
> +     close(epoll_fd);
> +}
> +
> +/*
> + * Process and output the probe data at the supplied address.
> + */
> +static void output_event(int cpu, u64 *buf)
> +{
> +     u8                              *data = (u8 *)buf;
> +     struct perf_event_header        *hdr;
> +
> +     hdr = (struct perf_event_header *)data;
> +     data += sizeof(struct perf_event_header);
> +
> +     if (hdr->type == PERF_RECORD_SAMPLE) {
> +             u8              *ptr = data;
> +             u32             i, size, probe_id;
> +
> +             /*
> +              * struct {
> +              *      struct perf_event_header        header;
> +              *      u32                             size;
> +              *      u32                             probe_id;
> +              *      u32                             gap;
> +              *      u64                             data[n];
> +              * }
> +              * and data points to the 'size' member at this point.
> +              */
> +             if (ptr > (u8 *)buf + hdr->size) {
> +                     fprintf(stderr, "BAD: corrupted sample header\n");
> +                     return;
> +             }
> +
> +             size = *(u32 *)data;
> +             data += sizeof(size);
> +             ptr += sizeof(size) + size;
> +             if (ptr != (u8 *)buf + hdr->size) {
> +                     fprintf(stderr, "BAD: invalid sample size\n");
> +                     return;
> +             }
> +
> +             probe_id = *(u32 *)data;
> +             data += sizeof(probe_id);
> +             size -= sizeof(probe_id);
> +             data += sizeof(u32);            /* skip 32-bit gap */
> +             size -= sizeof(u32);
> +             buf = (u64 *)data;
> +
> +             printf("%3d %6d ", cpu, probe_id);
> +             for (i = 0, size /= sizeof(u64); i < size; i++)
> +                     printf("%#016lx ", buf[i]);
> +             printf("\n");
> +     } else if (hdr->type == PERF_RECORD_LOST) {
> +             u64     lost;
> +
> +             /*
> +              * struct {
> +              *      struct perf_event_header        header;
> +              *      u64                             id;
> +              *      u64                             lost;
> +              * }
> +              * and data points to the 'id' member at this point.
> +              */
> +             lost = *(u64 *)(data + sizeof(u64));
> +
> +             printf("[%ld probes dropped]\n", lost);
> +     } else
> +             fprintf(stderr, "UNKNOWN: record type %d\n", hdr->type);
> +}
> +
> +/*
> + * Process the available probe data in the given buffer.
> + */
> +static void process_data(struct dtrace_buffer *buf)
> +{
> +     struct perf_event_mmap_page     *rb_page = (void *)buf->base;
> +     struct perf_event_header        *hdr;
> +     u8                              *base;
> +     u64                             head, tail;
> +
> +     /* Set base to be the start of the buffer data. */
> +     base = buf->base + buf->page_size;
> +
> +     for (;;) {
> +             head = ring_buffer_read_head(rb_page);
> +             tail = rb_page->data_tail;
> +
> +             if (tail == head)
> +                     break;
> +
> +             do {
> +                     u8      *event = base + tail % buf->data_size;
> +                     u32     len;
> +
> +                     hdr = (struct perf_event_header *)event;
> +                     len = hdr->size;
> +
> +                     /*
> +                      * If the perf event data wraps around the boundary of
> +                      * the buffer, we make a copy in contiguous memory.
> +                      */
> +                     if (event + len > buf->endp) {
> +                             u8      *dst;
> +                             u32     num;
> +
> +                             /* Increase buffer as needed. */
> +                             if (buf->tmp_len < len) {
> +                                     buf->tmp = realloc(buf->tmp, len);
> +                                     buf->tmp_len = len;
> +                             }
> +
> +                             dst = buf->tmp;
> +                             num = buf->endp - event + 1;
> +                             memcpy(dst, event, num);
> +                             memcpy(dst + num, base, len - num);
> +
> +                             event = dst;
> +                     }
> +
> +                     output_event(buf->cpu, (u64 *)event);
> +
> +                     tail += hdr->size;
> +             } while (tail != head);
> +
> +             ring_buffer_write_tail(rb_page, tail);
> +     }
> +}
> +
> +/*
> + * Wait for data to become available in any of the buffers.
> + */
> +int dt_buffer_poll(int epoll_fd, int timeout)
> +{
> +     struct epoll_event      events[dt_numcpus];
> +     int                     i, cnt;
> +
> +     cnt = epoll_wait(epoll_fd, events, dt_numcpus, timeout);
> +     if (cnt < 0)
> +             return -errno;
> +
> +     for (i = 0; i < cnt; i++)
> +             process_data((struct dtrace_buffer *)events[i].data.ptr);
> +
> +     return cnt;
> +}
> diff --git a/tools/dtrace/dt_fbt.c b/tools/dtrace/dt_fbt.c
> new file mode 100644
> index 000000000000..fcf95243bf97
> --- /dev/null
> +++ b/tools/dtrace/dt_fbt.c
> @@ -0,0 +1,201 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * The Function Boundary Tracing (FBT) provider for DTrace.
> + *
> + * FBT probes are exposed by the kernel as kprobes.  They are listed in the
> + * TRACEFS/available_filter_functions file.  Some kprobes are associated with
> + * a specific kernel module, while most are in the core kernel.
> + *
> + * Mapping from event name to DTrace probe name:
> + *
> + *      <name>                                       fbt:vmlinux:<name>:entry
> + *                                           fbt:vmlinux:<name>:return
> + *   or
> + *      <name> [<modname>]                   fbt:<modname>:<name>:entry
> + *                                           fbt:<modname>:<name>:return
> + *
> + * Mapping from BPF section name to DTrace probe name:
> + *
> + *      kprobe/<name>                                fbt:vmlinux:<name>:entry
> + *      kretprobe/<name>                     fbt:vmlinux:<name>:return
> + *
> + * (Note that the BPF section does not carry information about the module that
> + *  the function is found in.  This means that BPF section name cannot be used
> + *  to distinguish between functions with the same name occurring in different
> + *  modules.)
> + *
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +#include <fcntl.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <linux/bpf.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +
> +#include "dtrace_impl.h"
> +
> +#define KPROBE_EVENTS        TRACEFS "kprobe_events" /* written to register kprobes */
> +#define PROBE_LIST   TRACEFS "available_filter_functions" /* scanned for probes */
> +
> +static const char    provname[] = "fbt";     /* provider name of FBT probes */
> +static const char    modname[] = "vmlinux";  /* module when none is listed */
> +
> +/*
> + * Scan the PROBE_LIST file and add entry and return probes for every function
> + * that is listed.
> + *
> + * Returns 0 on success, or -1 if PROBE_LIST cannot be opened or if a probe
> + * could not be added.
> + */
> +static int fbt_populate(void)
> +{
> +     FILE                    *f;
> +     char                    buf[256];
> +     char                    *p;
> +     int                     rc = 0;
> +
> +     f = fopen(PROBE_LIST, "r");
> +     if (f == NULL)
> +             return -1;
> +
> +     while (fgets(buf, sizeof(buf), f)) {
> +             const char      *mod = modname;
> +
> +             /*
> +              * Here buf is either "funcname\n" or "funcname [modname]\n".
> +              */
> +             p = strchr(buf, '\n');
> +             if (!p) {
> +                     /*
> +                      * If we didn't see a newline, the line was too long.
> +                      * Report it, and skip until the end of the line.  The
> +                      * skip must also stop when fgets() fails: at EOF it
> +                      * returns NULL without changing buf, so testing buf
> +                      * alone would never terminate.
> +                      */
> +                     fprintf(stderr, "%s: Line too long: %s\n",
> +                             PROBE_LIST, buf);
> +                     while (fgets(buf, sizeof(buf), f) &&
> +                            strchr(buf, '\n') == NULL)
> +                             ;
> +                     continue;
> +             }
> +
> +             /* Drop the newline and the ']' that closes a module name. */
> +             *p = '\0';
> +             if (p > buf && *(--p) == ']')
> +                     *p = '\0';
> +
> +             /*
> +              * Now buf is either "funcname" or "funcname [modname".  If
> +              * there is no module name provided, we will use the default.
> +              */
> +             p = strchr(buf, ' ');
> +             if (p) {
> +                     *p++ = '\0';
> +                     if (*p == '[')
> +                             p++;
> +                     mod = p;
> +             }
> +
> +             /* Do not silently continue with an incomplete probe list. */
> +             if (dt_probe_new(&dt_fbt, provname, mod, buf, "entry") < 0 ||
> +                 dt_probe_new(&dt_fbt, provname, mod, buf, "return") < 0) {
> +                     rc = -1;
> +                     break;
> +             }
> +     }
> +
> +     fclose(f);
> +
> +     return rc;
> +}
> +
> +#define ENTRY_PREFIX "kprobe/"
> +#define EXIT_PREFIX  "kretprobe/"
> +
> +/*
> + * Translate an event name (BPF ELF section name) into an FBT probe.
> + *
> + * "kprobe/<func>" selects the "entry" probe of <func> and "kretprobe/<func>"
> + * selects its "return" probe; both are looked up in the default module.
> + * Returns NULL for a NULL name, for any other prefix, and when the lookup
> + * finds no probe.
> + */
> +static struct dt_probe *fbt_resolve_event(const char *name)
> +{
> +     const size_t    entry_len = sizeof(ENTRY_PREFIX) - 1;
> +     const size_t    exit_len = sizeof(EXIT_PREFIX) - 1;
> +     struct dt_probe key;
> +
> +     if (name == NULL)
> +             return NULL;
> +
> +     memset(&key, 0, sizeof(key));
> +
> +     if (strncmp(name, ENTRY_PREFIX, entry_len) == 0) {
> +             key.fun_name = name + entry_len;
> +             key.prb_name = "entry";
> +     } else if (strncmp(name, EXIT_PREFIX, exit_len) == 0) {
> +             key.fun_name = name + exit_len;
> +             key.prb_name = "return";
> +     } else {
> +             return NULL;
> +     }
> +
> +     key.prv_name = provname;
> +     key.mod_name = modname;
> +
> +     return dt_probe_by_name(&key);
> +}
> +
> +/*
> + * Attach the given BPF program (identified by its file descriptor) to the
> + * kprobe or kretprobe identified by the given section name, which must be
> + * "kprobe/<func>" or "kretprobe/<func>".
> + *
> + * The probe is first registered through KPROBE_EVENTS, which creates a
> + * tracepoint event named after the function.  The id of that event is then
> + * read back and handed to dt_bpf_attach().
> + *
> + * NOTE(review): entry and return probes on the same function use the same
> + * event name; confirm whether both can be registered at the same time.
> + *
> + * Returns the result of dt_bpf_attach(), or -1 if the probe could not be
> + * registered or its event id could not be read.
> + */
> +static int fbt_attach(const char *name, int bpf_fd)
> +{
> +     char    efn[256];
> +     char    buf[256];
> +     char    type;
> +     char    *end;
> +     long    event_id;
> +     int     fd, len;
> +     ssize_t rc;
> +
> +     /* Select the probe type from the prefix instead of assuming it. */
> +     if (strncmp(name, ENTRY_PREFIX, sizeof(ENTRY_PREFIX) - 1) == 0) {
> +             name += sizeof(ENTRY_PREFIX) - 1;
> +             type = 'p';
> +     } else if (strncmp(name, EXIT_PREFIX, sizeof(EXIT_PREFIX) - 1) == 0) {
> +             name += sizeof(EXIT_PREFIX) - 1;
> +             type = 'r';
> +     } else {
> +             fprintf(stderr, "%s: not a kprobe or kretprobe section\n",
> +                     name);
> +             return -1;
> +     }
> +
> +     len = snprintf(buf, sizeof(buf), "%c:%s %s\n", type, name, name);
> +     if (len < 0 || (size_t)len >= sizeof(buf)) {
> +             fprintf(stderr, "%s: function name too long\n", name);
> +             return -1;
> +     }
> +
> +     /*
> +      * Register the kprobe with the tracing subsystem.  This will create
> +      * a tracepoint event.
> +      */
> +     fd = open(KPROBE_EVENTS, O_WRONLY | O_APPEND);
> +     if (fd < 0) {
> +             perror(KPROBE_EVENTS);
> +             return -1;
> +     }
> +     rc = write(fd, buf, len);
> +     if (rc < 0) {
> +             perror(KPROBE_EVENTS);
> +             close(fd);
> +             return -1;
> +     }
> +     close(fd);
> +
> +     /*
> +      * Read the tracepoint event id for the kprobe we just registered.
> +      * The path is built with snprintf() so that a long function name
> +      * cannot overflow efn.
> +      */
> +     len = snprintf(efn, sizeof(efn), "%skprobes/%s/id", EVENTSFS, name);
> +     if (len < 0 || (size_t)len >= sizeof(efn)) {
> +             fprintf(stderr, "%s: function name too long\n", name);
> +             return -1;
> +     }
> +
> +     fd = open(efn, O_RDONLY);
> +     if (fd < 0) {
> +             perror(efn);
> +             return -1;
> +     }
> +     /* Leave room for the terminating NUL byte. */
> +     rc = read(fd, buf, sizeof(buf) - 1);
> +     if (rc < 0) {
> +             perror(efn);
> +             close(fd);
> +             return -1;
> +     }
> +     close(fd);
> +     buf[rc] = '\0';
> +
> +     event_id = strtol(buf, &end, 10);
> +     if (end == buf || event_id < 0) {
> +             fprintf(stderr, "%s: invalid event id\n", efn);
> +             return -1;
> +     }
> +
> +     /*
> +      * Attaching a BPF program (by file descriptor) to an event (by ID) is
> +      * a generic operation provided by the BPF interface code.
> +      */
> +     return dt_bpf_attach((int)event_id, bpf_fd);
> +}
> +
> +struct dt_provider   dt_fbt = {              /* FBT provider operations */
> +     .name           = "fbt",
> +     .populate       = &fbt_populate,        /* add probes from PROBE_LIST */
> +     .resolve_event  = &fbt_resolve_event,   /* section name -> probe */
> +     .attach         = &fbt_attach,          /* attach BPF program */
> +};
> diff --git a/tools/dtrace/dt_hash.c b/tools/dtrace/dt_hash.c
> new file mode 100644
> index 000000000000..b1f563bc0773
> --- /dev/null
> +++ b/tools/dtrace/dt_hash.c
> @@ -0,0 +1,211 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * This file provides a generic hashtable implementation for probes.
> + *
> + * The hashtable is created with 4 user-provided functions:
> + *   hval(probe)             - calculate a hash value for the given probe
> + *   cmp(probe1, probe2)     - compare two probes
> + *   add(head, probe)        - add a probe to a list of probes
> + *   del(head, probe)        - delete a probe from a list of probes
> + *
> + * Probes are hashed into a hashtable slot based on the return value of
> + * hval(probe).  Each hashtable slot holds a list of buckets, with each
> + * bucket storing probes that are equal under the cmp(probe1, probe2)
> + * function. Probes are added to the list of probes in a bucket using the
> + * add(head, probe) function, and they are deleted using a call to the
> + * del(head, probe) function.
> + *
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +#include <errno.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +
> +#include "dtrace_impl.h"
> +
> +/*
> + * Hashtable implementation for probes.
> + *
> + * A bucket groups the probes that compare equal under cmp(); buckets whose
> + * hash value selects the same slot are chained through 'next'.
> + */
> +struct dt_hbucket {
> +     u32                     hval;           /* hash value of the probes */
> +     struct dt_hbucket       *next;          /* next bucket in this slot */
> +     struct dt_probe         *head;          /* list of probes in bucket */
> +     int                     nprobes;        /* number of probes in list */
> +};
> +
> +struct dt_htab {
> +     struct dt_hbucket       **tab;          /* array of 'size' slots */
> +     int                     size;           /* number of slots */
> +     int                     mask;           /* size - 1, for slot index */
> +     int                     nbuckets;       /* number of buckets in use */
> +     dt_hval_fn              hval;           /* calculate hash value */
> +     dt_cmp_fn               cmp;            /* compare 2 probes */
> +     dt_add_fn               add;            /* add probe to list */
> +     dt_del_fn               del;            /* delete probe from list */
> +};
> +
> +/*
> + * Allocate an empty hashtable that uses the given callbacks.  The table
> + * starts out with a single slot.
> + *
> + * Returns the new hashtable, or NULL if memory could not be allocated.
> + */
> +struct dt_htab *dt_htab_new(dt_hval_fn hval, dt_cmp_fn cmp, dt_add_fn add,
> +                         dt_del_fn del)
> +{
> +     struct dt_htab  *ht;
> +
> +     ht = malloc(sizeof(*ht));
> +     if (ht == NULL)
> +             return NULL;
> +
> +     ht->hval = hval;
> +     ht->cmp = cmp;
> +     ht->add = add;
> +     ht->del = del;
> +     ht->nbuckets = 0;
> +     ht->size = 1;
> +     ht->mask = ht->size - 1;
> +
> +     ht->tab = calloc(ht->size, sizeof(*ht->tab));
> +     if (ht->tab != NULL)
> +             return ht;
> +
> +     free(ht);
> +     return NULL;
> +}
> +
> +/*
> + * Double the number of slots of the hashtable and move every bucket to the
> + * slot selected by its stored hash value under the new mask.
> + *
> + * Returns 0 on success, or -ENOMEM (leaving the table untouched) if the new
> + * slot array could not be allocated.
> + */
> +static int resize(struct dt_htab *htab)
> +{
> +     const int               newsize = htab->size << 1;
> +     const int               newmask = newsize - 1;
> +     struct dt_hbucket       **newtab;
> +     int                     slot;
> +
> +     newtab = calloc(newsize, sizeof(*newtab));
> +     if (newtab == NULL)
> +             return -ENOMEM;
> +
> +     for (slot = 0; slot < htab->size; slot++) {
> +             struct dt_hbucket       *cur = htab->tab[slot];
> +
> +             while (cur != NULL) {
> +                     struct dt_hbucket       *rest = cur->next;
> +                     struct dt_hbucket       **dst;
> +
> +                     /* Push the bucket onto the front of its new slot. */
> +                     dst = &newtab[cur->hval & newmask];
> +                     cur->next = *dst;
> +                     *dst = cur;
> +
> +                     cur = rest;
> +             }
> +     }
> +
> +     free(htab->tab);
> +     htab->tab = newtab;
> +     htab->size = newsize;
> +     htab->mask = newmask;
> +
> +     return 0;
> +}
> +
> +/*
> + * Add a probe to the hashtable.
> + *
> + * The probe joins the bucket of the probes it compares equal to.  If there
> + * is no such bucket, a new one is allocated, after first growing the table
> + * when the number of buckets exceeds twice the number of slots.
> + *
> + * Returns 0 on success, or a negative error code if memory allocation fails.
> + */
> +int dt_htab_add(struct dt_htab *htab, struct dt_probe *probe)
> +{
> +     const u32               hval = htab->hval(probe);
> +     struct dt_hbucket       *bucket;
> +     int                     idx;
> +
> +     for (;;) {
> +             int     err;
> +
> +             /* Look for a bucket that holds probes equal to this one. */
> +             idx = hval & htab->mask;
> +             bucket = htab->tab[idx];
> +             while (bucket && htab->cmp(bucket->head, probe) != 0)
> +                     bucket = bucket->next;
> +
> +             if (bucket || (htab->nbuckets >> 1) <= htab->size)
> +                     break;
> +
> +             /* Grow the table, and search again using the new mask. */
> +             err = resize(htab);
> +             if (err)
> +                     return err;
> +     }
> +
> +     if (!bucket) {
> +             bucket = malloc(sizeof(*bucket));
> +             if (!bucket)
> +                     return -ENOMEM;
> +
> +             bucket->hval = hval;
> +             bucket->head = NULL;
> +             bucket->nprobes = 0;
> +             bucket->next = htab->tab[idx];
> +             htab->tab[idx] = bucket;
> +             htab->nbuckets++;
> +     }
> +
> +     bucket->head = htab->add(bucket->head, probe);
> +     bucket->nprobes++;
> +
> +     return 0;
> +}
> +
> +/*
> + * Find a probe in the hashtable.
> + *
> + * Returns the head of the probe list of the bucket whose probes compare equal
> + * to 'probe', or NULL if there is no such bucket.
> + */
> +struct dt_probe *dt_htab_lookup(const struct dt_htab *htab,
> +                             const struct dt_probe *probe)
> +{
> +     const u32               hval = htab->hval(probe);
> +     struct dt_hbucket       *cur = htab->tab[hval & htab->mask];
> +
> +     while (cur != NULL) {
> +             if (htab->cmp(cur->head, probe) == 0)
> +                     return cur->head;
> +
> +             cur = cur->next;
> +     }
> +
> +     return NULL;
> +}
> +
> +/*
> + * Remove a probe from the hashtable.  If we are deleting the last probe in a
> + * bucket, get rid of the bucket.
> + *
> + * Returns 0 on success, or -ENOENT if no bucket holds probes that compare
> + * equal to 'probe'.
> + */
> +int dt_htab_del(struct dt_htab *htab, struct dt_probe *probe)
> +{
> +     u32                     hval = htab->hval(probe);
> +     int                     idx = hval & htab->mask;
> +     struct dt_hbucket       **link = &htab->tab[idx];
> +     struct dt_hbucket       *bucket;
> +
> +     /*
> +      * Walk the slot while remembering the pointer that refers to the
> +      * current bucket, so the bucket can be unlinked without a second pass.
> +      */
> +     for (; (bucket = *link) != NULL; link = &bucket->next) {
> +             if (htab->cmp(bucket->head, probe) == 0)
> +                     break;
> +     }
> +
> +     if (bucket == NULL)
> +             return -ENOENT;
> +
> +     bucket->head = htab->del(bucket->head, probe);
> +     /* Keep the count in step with the increment done in dt_htab_add(). */
> +     bucket->nprobes--;
> +
> +     if (!bucket->head) {
> +             *link = bucket->next;
> +             htab->nbuckets--;
> +             free(bucket);
> +     }
> +
> +     return 0;
> +}
> diff --git a/tools/dtrace/dt_probe.c b/tools/dtrace/dt_probe.c
> new file mode 100644
> index 000000000000..0b6228eaff29
> --- /dev/null
> +++ b/tools/dtrace/dt_probe.c
> @@ -0,0 +1,230 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * This file implements the interface to probes grouped by provider.
> + *
> + * Probes are named by a set of 4 identifiers:
> + *   - provider name
> + *   - module name
> + *   - function name
> + *   - probe name
> + *
> + * The Fully Qualified Name (FQN) is "provider:module:function:name".
> + *
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +#include <errno.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <linux/bpf.h>
> +#include <linux/kernel.h>
> +
> +#include "dtrace_impl.h"
> +
> +static struct dt_provider      *dt_providers[] = {   /* all known providers */
> +                                                     &dt_fbt,
> +                                                     &dt_syscall,
> +                                              };
> +
> +static struct dt_htab        *ht_byfqn;      /* probes hashed by their FQN */
> +
> +static u32           next_probe_id;  /* id given to the next new probe */
> +
> +/*
> + * Fold the characters of a string into a running hash value.  Passing the
> + * result of one call as the initial value of the next yields a compound hash
> + * value, e.g.
> + *
> + *   u32     hval;
> + *
> + *   hval = str2hval(str1, 0);
> + *   hval = str2hval(str2, hval);
> + *
> + * A NULL string leaves the hash value unchanged.
> + */
> +static u32 str2hval(const char *p, u32 hval)
> +{
> +     const char      *s;
> +
> +     if (p == NULL)
> +             return hval;
> +
> +     for (s = p; *s != '\0'; s++) {
> +             u32     top;
> +
> +             hval = (hval << 4) + *s;
> +
> +             /* Fold the top 4 bits back in (a no-op when they are 0). */
> +             top = hval & 0xf0000000;
> +             hval ^= top >> 24;
> +             hval &= ~top;
> +     }
> +
> +     return hval;
> +}
> +
> +/*
> + * strcmp() variant that accepts NULL for either or both strings.  Two NULL
> + * strings are equal, and a NULL string sorts before any non-NULL string.
> + */
> +static int safe_strcmp(const char *p, const char *q)
> +{
> +     if (p == NULL)
> +             return (q == NULL) ? 0 : -1;
> +     if (q == NULL)
> +             return 1;
> +
> +     return strcmp(p, q);
> +}
> +
> +/*
> + * Calculate the hash value of a probe as the cumulative hash value of its
> + * FQN, i.e. of the four names joined by ":".
> + */
> +static u32 fqn_hval(const struct dt_probe *probe)
> +{
> +     const char      *parts[] = {
> +                             probe->prv_name,
> +                             probe->mod_name,
> +                             probe->fun_name,
> +                             probe->prb_name,
> +                     };
> +     u32             hval = 0;
> +     size_t          i;
> +
> +     for (i = 0; i < sizeof(parts) / sizeof(parts[0]); i++) {
> +             if (i > 0)
> +                     hval = str2hval(":", hval);
> +             hval = str2hval(parts[i], hval);
> +     }
> +
> +     return hval;
> +}
> +
> +/*
> + * Compare two probes based on the FQN: provider name first, then module
> + * name, function name, and probe name.  The result is that of the first
> + * comparison that differs, or 0 if all four names are equal.
> + */
> +static int fqn_cmp(const struct dt_probe *p, const struct dt_probe *q)
> +{
> +     int     diff;
> +
> +     diff = safe_strcmp(p->prv_name, q->prv_name);
> +     if (diff != 0)
> +             return diff;
> +
> +     diff = safe_strcmp(p->mod_name, q->mod_name);
> +     if (diff != 0)
> +             return diff;
> +
> +     diff = safe_strcmp(p->fun_name, q->fun_name);
> +     if (diff != 0)
> +             return diff;
> +
> +     return safe_strcmp(p->prb_name, q->prb_name);
> +}
> +
> +/*
> + * Insert probe 'new' in front of the double-linked probe list 'head' (which
> + * may be empty, i.e. NULL).  Returns 'new', the new head of the list.
> + */
> +static struct dt_probe *fqn_add(struct dt_probe *head, struct dt_probe *new)
> +{
> +     if (head != NULL) {
> +             head->he_fqn.prev = new;
> +             new->he_fqn.next = head;
> +     }
> +
> +     return new;
> +}
> +
> +/*
> + * Unlink the given probe 'probe' from the double-linked probe list 'head'.
> + * Returns the head of the resulting list: the next probe if the old head was
> + * removed, or 'head' itself otherwise.  A NULL result means that the list is
> + * now empty.
> + */
> +static struct dt_probe *fqn_del(struct dt_probe *head, struct dt_probe *probe)
> +{
> +     struct dt_probe *after = probe->he_fqn.next;
> +     struct dt_probe *before;
> +
> +     if (probe == head) {
> +             /* Removing the only probe leaves an empty list. */
> +             if (after == NULL)
> +                     return NULL;
> +
> +             after->he_fqn.prev = NULL;
> +             probe->he_fqn.next = NULL;
> +
> +             return after;
> +     }
> +
> +     /* Not the head, so there is a probe in front of this one. */
> +     before = probe->he_fqn.prev;
> +     before->he_fqn.next = after;
> +     if (after != NULL)
> +             after->he_fqn.prev = before;
> +
> +     probe->he_fqn.next = NULL;
> +     probe->he_fqn.prev = NULL;
> +
> +     return head;
> +}
> +
> +/*
> + * Initialize the probe handling by populating the FQN hashtable with probes
> + * from all providers.
> + *
> + * Returns 0 on success, or -1 if the hashtable could not be created or if a
> + * provider failed to populate its probes.
> + */
> +int dt_probe_init(void)
> +{
> +     size_t  i;
> +
> +     ht_byfqn = dt_htab_new(fqn_hval, fqn_cmp, fqn_add, fqn_del);
> +     /* Providers add probes to the table, so it must exist first. */
> +     if (!ht_byfqn)
> +             return -1;
> +
> +     for (i = 0; i < ARRAY_SIZE(dt_providers); i++) {
> +             if (dt_providers[i]->populate() < 0)
> +                     return -1;
> +     }
> +
> +     return 0;
> +}
> +
> +/*
> + * Allocate a new probe and add it to the FQN hashtable.
> + *
> + * The probe gets private copies of the four names (any of which may be NULL)
> + * and the next free probe id.  The id is only consumed when the probe was
> + * added successfully.
> + *
> + * Returns 0 on success, or a negative error code if memory allocation or the
> + * insertion into the hashtable failed.  Nothing is leaked on failure.
> + */
> +int dt_probe_new(const struct dt_provider *prov, const char *pname,
> +              const char *mname, const char *fname, const char *name)
> +{
> +     struct dt_probe *probe;
> +     char            *prv = NULL;
> +     char            *mod = NULL;
> +     char            *fun = NULL;
> +     char            *prb = NULL;
> +     int             err = -ENOMEM;
> +
> +     probe = calloc(1, sizeof(*probe));
> +     if (!probe)
> +             return -ENOMEM;
> +
> +     /* Copy each name that was given; stop at the first failure. */
> +     if ((pname && !(prv = strdup(pname))) ||
> +         (mname && !(mod = strdup(mname))) ||
> +         (fname && !(fun = strdup(fname))) ||
> +         (name && !(prb = strdup(name))))
> +             goto fail;
> +
> +     probe->id = next_probe_id;
> +     probe->prov = prov;
> +     probe->prv_name = prv;
> +     probe->mod_name = mod;
> +     probe->fun_name = fun;
> +     probe->prb_name = prb;
> +
> +     err = dt_htab_add(ht_byfqn, probe);
> +     if (err)
> +             goto fail;
> +
> +     next_probe_id++;
> +
> +     return 0;
> +
> +fail:
> +     free(prb);
> +     free(fun);
> +     free(mod);
> +     free(prv);
> +     free(probe);
> +
> +     return err;
> +}
> +
> +/*
> + * Perform a probe lookup based on FQN.
> + *
> + * The names in 'tmpl' form the lookup key.  Returns the result of looking up
> + * 'tmpl' in the FQN hashtable, which is NULL when no probe matches.
> + */
> +struct dt_probe *dt_probe_by_name(const struct dt_probe *tmpl)
> +{
> +     return dt_htab_lookup(ht_byfqn, tmpl);
> +}
> +
> +/*
> + * Resolve an event name (BPF ELF section name) into a probe.  The providers
> + * are asked in turn (those without a resolve_event function are skipped), and
> + * the first probe that is found is returned.  Returns NULL if no provider
> + * recognizes the name.
> + */
> +struct dt_probe *dt_probe_resolve_event(const char *name)
> +{
> +     struct dt_probe *found = NULL;
> +     size_t          n;
> +
> +     for (n = 0; found == NULL && n < ARRAY_SIZE(dt_providers); n++) {
> +             if (dt_providers[n]->resolve_event != NULL)
> +                     found = dt_providers[n]->resolve_event(name);
> +     }
> +
> +     return found;
> +}
> diff --git a/tools/dtrace/dt_syscall.c b/tools/dtrace/dt_syscall.c
> new file mode 100644
> index 000000000000..6695a4a1c701
> --- /dev/null
> +++ b/tools/dtrace/dt_syscall.c
> @@ -0,0 +1,179 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * The syscall provider for DTrace.
> + *
> + * System call probes are exposed by the kernel as tracepoint events in the
> + * "syscalls" group.  Entry probe names start with "sys_enter_" and exit probes
> + * start with "sys_exit_".
> + *
> + * Mapping from event name to DTrace probe name:
> + *
> + *   syscalls:sys_enter_<name>               syscall:vmlinux:<name>:entry
> + *   syscalls:sys_exit_<name>                syscall:vmlinux:<name>:return
> + *
> + * Mapping from BPF section name to DTrace probe name:
> + *
> + *   tracepoint/syscalls/sys_enter_<name>    syscall:vmlinux:<name>:entry
> + *   tracepoint/syscalls/sys_exit_<name>     syscall:vmlinux:<name>:return
> + *
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +#include <ctype.h>
> +#include <fcntl.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <linux/bpf.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +
> +#include "dtrace_impl.h"
> +
> +static const char    provname[] = "syscall"; /* provider name of the probes */
> +static const char    modname[] = "vmlinux";  /* module name of the probes */
> +
> +#define PROBE_LIST   TRACEFS "available_events"      /* scanned for probes */
> +
> +#define PROV_PREFIX  "syscalls:"     /* event group we are interested in */
> +#define ENTRY_PREFIX "sys_enter_"    /* event name prefix of entry probes */
> +#define EXIT_PREFIX  "sys_exit_"     /* event name prefix of return probes */
> +
> +/*
> + * Scan the PROBE_LIST file and add probes for any syscalls events.
> + *
> + * Returns 0 on success, or -1 if PROBE_LIST cannot be opened or if a probe
> + * could not be added.
> + */
> +static int syscall_populate(void)
> +{
> +     FILE                    *f;
> +     char                    buf[256];
> +     int                     rc = 0;
> +
> +     f = fopen(PROBE_LIST, "r");
> +     if (f == NULL)
> +             return -1;
> +
> +     while (fgets(buf, sizeof(buf), f)) {
> +             char            *p;
> +             const char      *prbname;
> +
> +             /* Here buf is "group:event\n". */
> +             p = strchr(buf, '\n');
> +             if (!p) {
> +                     /*
> +                      * If we didn't see a newline, the line was too long.
> +                      * Report it, and skip until the end of the line.  The
> +                      * skip must also stop when fgets() fails: at EOF it
> +                      * returns NULL without changing buf, so testing buf
> +                      * alone would never terminate.
> +                      */
> +                     fprintf(stderr, "%s: Line too long: %s\n",
> +                             PROBE_LIST, buf);
> +                     while (fgets(buf, sizeof(buf), f) &&
> +                            strchr(buf, '\n') == NULL)
> +                             ;
> +                     continue;
> +             }
> +             *p = '\0';
> +
> +             /*
> +              * We need "group:" to match "syscalls:".  The comparisons use
> +              * strncmp() so that they never look beyond the end of a line
> +              * that is shorter than the prefix.
> +              */
> +             if (strncmp(buf, PROV_PREFIX, sizeof(PROV_PREFIX) - 1) != 0)
> +                     continue;
> +
> +             p = buf + sizeof(PROV_PREFIX) - 1;
> +             /*
> +              * Now p will be just "event", and we are only interested in
> +              * events that match "sys_enter_*" or "sys_exit_*".
> +              */
> +             if (!strncmp(p, ENTRY_PREFIX, sizeof(ENTRY_PREFIX) - 1)) {
> +                     p += sizeof(ENTRY_PREFIX) - 1;
> +                     prbname = "entry";
> +             } else if (!strncmp(p, EXIT_PREFIX, sizeof(EXIT_PREFIX) - 1)) {
> +                     p += sizeof(EXIT_PREFIX) - 1;
> +                     prbname = "return";
> +             } else
> +                     continue;
> +
> +             /* Do not silently continue with an incomplete probe list. */
> +             if (dt_probe_new(&dt_syscall, provname, modname, p,
> +                              prbname) < 0) {
> +                     rc = -1;
> +                     break;
> +             }
> +     }
> +
> +     fclose(f);
> +
> +     return rc;
> +}
> +
> +#define EVENT_PREFIX "tracepoint/syscalls/"
> +
> +/*
> + * Translate an event name (BPF ELF section name) into a syscall probe.
> + *
> + * The name must start with EVENT_PREFIX, followed by "sys_enter_<name>" for
> + * the "entry" probe of <name> or "sys_exit_<name>" for its "return" probe.
> + * Returns NULL for a NULL name, for any other name, and when the lookup finds
> + * no probe.
> + */
> +static struct dt_probe *systrace_resolve_event(const char *name)
> +{
> +     const size_t    event_len = sizeof(EVENT_PREFIX) - 1;
> +     const size_t    entry_len = sizeof(ENTRY_PREFIX) - 1;
> +     const size_t    exit_len = sizeof(EXIT_PREFIX) - 1;
> +     struct dt_probe key;
> +
> +     if (name == NULL)
> +             return NULL;
> +
> +     /* Exclude anything that is not a syscalls tracepoint */
> +     if (strncmp(name, EVENT_PREFIX, event_len) != 0)
> +             return NULL;
> +     name += event_len;
> +
> +     memset(&key, 0, sizeof(key));
> +
> +     if (strncmp(name, ENTRY_PREFIX, entry_len) == 0) {
> +             key.fun_name = name + entry_len;
> +             key.prb_name = "entry";
> +     } else if (strncmp(name, EXIT_PREFIX, exit_len) == 0) {
> +             key.fun_name = name + exit_len;
> +             key.prb_name = "return";
> +     } else {
> +             return NULL;
> +     }
> +
> +     key.prv_name = provname;
> +     key.mod_name = modname;
> +
> +     return dt_probe_by_name(&key);
> +}
> +
> +#define SYSCALLSFS   EVENTSFS "syscalls/"
> +
> +/*
> + * Attach the given BPF program (identified by its file descriptor) to the
> + * event identified by the given section name, which must start with
> + * EVENT_PREFIX.
> + *
> + * The id of the event is read from its "id" file below SYSCALLSFS and handed
> + * to dt_bpf_attach().
> + *
> + * Returns the result of dt_bpf_attach(), or -1 if the section name is not
> + * valid or the event id could not be read.
> + */
> +static int syscall_attach(const char *name, int bpf_fd)
> +{
> +     char    efn[256];
> +     char    buf[256];
> +     char    *end;
> +     long    event_id;
> +     int     fd, len;
> +     ssize_t rc;
> +
> +     if (strncmp(name, EVENT_PREFIX, sizeof(EVENT_PREFIX) - 1) != 0) {
> +             fprintf(stderr, "%s: not a syscalls tracepoint section\n",
> +                     name);
> +             return -1;
> +     }
> +     name += sizeof(EVENT_PREFIX) - 1;
> +
> +     /*
> +      * The path is built with snprintf() so that a long event name cannot
> +      * overflow efn.
> +      */
> +     len = snprintf(efn, sizeof(efn), "%s%s/id", SYSCALLSFS, name);
> +     if (len < 0 || (size_t)len >= sizeof(efn)) {
> +             fprintf(stderr, "%s: event name too long\n", name);
> +             return -1;
> +     }
> +
> +     fd = open(efn, O_RDONLY);
> +     if (fd < 0) {
> +             perror(efn);
> +             return -1;
> +     }
> +     /* Leave room for the terminating NUL byte. */
> +     rc = read(fd, buf, sizeof(buf) - 1);
> +     if (rc < 0) {
> +             perror(efn);
> +             close(fd);
> +             return -1;
> +     }
> +     close(fd);
> +     buf[rc] = '\0';
> +
> +     event_id = strtol(buf, &end, 10);
> +     if (end == buf || event_id < 0) {
> +             fprintf(stderr, "%s: invalid event id\n", efn);
> +             return -1;
> +     }
> +
> +     return dt_bpf_attach((int)event_id, bpf_fd);
> +}
> +
> +struct dt_provider   dt_syscall = {          /* syscall provider operations */
> +     .name           = "syscall",
> +     .populate       = &syscall_populate,    /* add probes from PROBE_LIST */
> +     .resolve_event  = &systrace_resolve_event, /* section name -> probe */
> +     .attach         = &syscall_attach,      /* attach BPF program */
> +};
> diff --git a/tools/dtrace/dt_utils.c b/tools/dtrace/dt_utils.c
> new file mode 100644
> index 000000000000..55d51bae1d97
> --- /dev/null
> +++ b/tools/dtrace/dt_utils.c
> @@ -0,0 +1,132 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +
> +#include "dtrace_impl.h"
> +
> +#define BUF_SIZE     1024            /* size of buffer for online cpu data */
> +
> +int  dt_numcpus;                     /* number of online CPUs */
> +int  dt_maxcpuid;                    /* highest online CPU id */
> +int  *dt_cpuids;                     /* array of dt_numcpus CPU ids */
> +
> +/*
> + * Parse a CPU list that consists of comma-separated CPU ids and inclusive
> + * "first-last" ranges (e.g. "0-3,5").  When 'ids' is not NULL, every CPU id
> + * is stored into it, in order of appearance.
> + *
> + * Returns the number of CPU ids in the list, or -1 if the list is malformed.
> + */
> +static int cpu_list_parse(const char *str, int *ids)
> +{
> +     const char      *p = str;
> +     int             cnt = 0;
> +
> +     do {
> +             char    *q;
> +             long    start, end, id;
> +
> +             start = strtol(p, &q, 10);
> +             if (q == p || start < 0)        /* no (valid) number */
> +                     return -1;
> +
> +             end = start;
> +             if (*q == '-') {                /* range */
> +                     p = q + 1;
> +                     end = strtol(p, &q, 10);
> +                     if (q == p || end < start)
> +                             return -1;
> +             }
> +
> +             if (*q == ',')                  /* gap */
> +                     q++;
> +             else if (*q != 0)               /* unexpected character */
> +                     return -1;
> +
> +             for (id = start; id <= end; id++) {
> +                     if (ids)
> +                             ids[cnt] = (int)id;
> +                     cnt++;
> +             }
> +
> +             p = q;
> +     } while (*p != 0);
> +
> +     return cnt;
> +}
> +
> +/*
> + * Populate the online CPU id information from sysfs data.  We only do this
> + * once because we do not care about CPUs coming online after we started
> + * tracing.  If a CPU goes offline during tracing, we do not care either
> + * because that simply means that it won't be writing any new probe data into
> + * its buffer.
> + *
> + * On success, dt_cpuids holds dt_numcpus CPU ids and dt_maxcpuid is the last
> + * (highest) of them.  On failure, the list is left empty: dt_cpuids is NULL
> + * and both dt_numcpus and dt_maxcpuid are 0.
> + */
> +void cpu_list_populate(void)
> +{
> +     char buf[BUF_SIZE];
> +     int fd, cnt, num;
> +     int *ids = NULL;
> +
> +     fd = open("/sys/devices/system/cpu/online", O_RDONLY);
> +     if (fd < 0)
> +             goto fail;
> +     /* Leave room for the terminating NUL byte. */
> +     cnt = read(fd, buf, sizeof(buf) - 1);
> +     close(fd);
> +     if (cnt <= 0)
> +             goto fail;
> +
> +     /*
> +      * The string should always end with a newline, but let's make sure
> +      * that it is terminated either way.
> +      */
> +     buf[cnt] = 0;
> +     if (buf[cnt - 1] == '\n')
> +             buf[--cnt] = 0;
> +
> +     /*
> +      * Count how many CPUs we have.
> +      */
> +     num = cpu_list_parse(buf, NULL);
> +     if (num <= 0)
> +             goto fail;
> +
> +     ids = calloc(num, sizeof(int));
> +     if (!ids)
> +             goto fail;
> +
> +     /*
> +      * Fill in the CPU ids.
> +      */
> +     if (cpu_list_parse(buf, ids) != num)
> +             goto fail;
> +
> +     /* Replace any list left behind by an earlier call. */
> +     free(dt_cpuids);
> +     dt_cpuids = ids;
> +     dt_numcpus = num;
> +
> +     /* Record the highest CPU id of the set of online CPUs. */
> +     dt_maxcpuid = ids[num - 1];
> +
> +     return;
> +fail:
> +     free(ids);
> +     free(dt_cpuids);
> +
> +     dt_numcpus = 0;
> +     dt_maxcpuid = 0;
> +     dt_cpuids = NULL;
> +}
> +
> +/*
> + * Release the list of CPU ids, and reset the CPU id information to the empty
> + * state.
> + */
> +void cpu_list_free(void)
> +{
> +     free(dt_cpuids);
> +     dt_cpuids = NULL;
> +
> +     dt_maxcpuid = 0;
> +     dt_numcpus = 0;
> +}
> diff --git a/tools/dtrace/dtrace.c b/tools/dtrace/dtrace.c
> new file mode 100644
> index 000000000000..36ad526c1cd4
> --- /dev/null
> +++ b/tools/dtrace/dtrace.c
> @@ -0,0 +1,249 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +#include <errno.h>
> +#include <libgen.h>
> +#include <stdarg.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <linux/log2.h>
> +
> +#include "dtrace_impl.h"
> +
> +#define DTRACE_BUFSIZE       32              /* default buffer size (in pages) */
> +
> +#define DMODE_VERS   0               /* display version information (-V) */
> +#define DMODE_LIST   1               /* list probes (-l) */
> +#define DMODE_EXEC   2               /* compile program and start tracing */
> +
> +#define E_SUCCESS    0
> +#define E_ERROR              1               /* returned by main() on failure */
> +#define E_USAGE              2               /* returned by usage() */
> +
> +#define NUM_PAGES(sz)        (((sz) + getpagesize() - 1) / getpagesize()) /* rounds up */
> +
> +static const char            *dtrace_options = "+b:ls:V"; /* for getopt() */
> +
> +static char                  *g_pname;       /* basename of argv[0] */
> +static int                   g_mode = DMODE_EXEC; /* one of DMODE_* */
> +
> +/*
> + * Print the command line synopsis and the list of options to stderr.
> + * Returns E_USAGE.
> + */
> +static int usage(void)
> +{
> +     static const char       options[] =
> +             "\t-b  set trace buffer size\n"
> +             "\t-l  list probes matching specified criteria\n"
> +             "\t-s  enable or list probes for the specified BPF program\n"
> +             "\t-V  report DTrace API version\n";
> +
> +     fprintf(stderr, "Usage: %s [-lV] [-b bufsz] -s script\n", g_pname);
> +     fputs(options, stderr);
> +
> +     return E_USAGE;
> +}
> +
> +/*
> + * Parse a size argument: a non-negative number (decimal, octal, or hex, as
> + * accepted by strtoull() with base 0) that may be followed by a single
> + * multiplier suffix: k, m, g, or t (in either case) for powers of 1024.
> + *
> + * Returns the size in bytes.  On invalid input, -1 converted to u64 (i.e.
> + * the maximum u64 value) is returned; callers have to compare against that
> + * value because an unsigned result is never less than 0.  Input is invalid
> + * when it is NULL or empty, has no digits, is negative, has trailing
> + * characters other than one suffix, or yields a size that does not fit in a
> + * signed 64-bit value.
> + */
> +static u64 parse_size(const char *arg)
> +{
> +     unsigned long long      mul = 1;
> +     unsigned long long      num;
> +     size_t                  len;
> +     char                    *end;
> +
> +     if (!arg)
> +             return -1;
> +
> +     len = strlen(arg);
> +     if (!len)
> +             return -1;
> +
> +     switch (arg[len - 1]) {
> +     case 't':
> +     case 'T':
> +             mul *= 1024;
> +             /* fall-through */
> +     case 'g':
> +     case 'G':
> +             mul *= 1024;
> +             /* fall-through */
> +     case 'm':
> +     case 'M':
> +             mul *= 1024;
> +             /* fall-through */
> +     case 'k':
> +     case 'K':
> +             mul *= 1024;
> +             /* fall-through */
> +     default:
> +             break;
> +     }
> +
> +     /*
> +      * strtoull() accepts a minus sign and negates the value, so negative
> +      * input is detected with a signed conversion first.
> +      */
> +     if (strtoll(arg, NULL, 0) < 0)
> +             return -1;
> +
> +     errno = 0;
> +     num = strtoull(arg, &end, 0);
> +     /* Reject out-of-range input and input without any digits. */
> +     if (errno != 0 || end == arg)
> +             return -1;
> +
> +     /* The number must be followed by the suffix, or by nothing at all. */
> +     if (mul > 1 ? end != &arg[len - 1] : *end != '\0')
> +             return -1;
> +
> +     /*
> +      * Check before multiplying: an overflowing product would wrap around
> +      * and could pass for a small, valid size.
> +      */
> +     if (num > (~0ULL >> 1) / mul)
> +             return -1;
> +
> +     return num * mul;
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +     int     i;
> +     int     modec = 0;
> +     int     bufsize = DTRACE_BUFSIZE;
> +     int     epoll_fd;
> +     int     cnt;
> +     char    **prgv;
> +     int     prgc;
> +
> +     g_pname = basename(argv[0]);
> +
> +     if (argc == 1)
> +             return usage();
> +
> +     prgc = 0;
> +     prgv = calloc(argc, sizeof(char *));
> +     if (!prgv) {
> +             fprintf(stderr, "failed to allocate memory for arguments: %s\n",
> +                     strerror(errno));
> +             return E_ERROR;
> +     }
> +
> +     argv[0] = g_pname;                      /* argv[0] for getopt errors */
> +
> +     for (optind = 1; optind < argc; optind++) {
> +             int     opt;
> +
> +             while ((opt = getopt(argc, argv, dtrace_options)) != EOF) {
> +                     u64                     val;
> +
> +                     switch (opt) {
> +                     case 'b':
> +                             val = parse_size(optarg);
> +                             if (val < 0) {
> +                                     fprintf(stderr, "invalid: -b %s\n",
> +                                             optarg);
> +                                     return E_ERROR;
> +                             }
> +
> +                             /*
> +                              * Bufsize needs to be a number of pages, and
> +                              * must be a power of 2.  This is required by
> +                              * the perf event buffer code.
> +                              */
> +                             bufsize = roundup_pow_of_two(NUM_PAGES(val));
> +                             if ((u64)bufsize * getpagesize() > val)
> +                                     fprintf(stderr,
> +                                             "bufsize increased to %ld\n",
> +                                             (u64)bufsize * getpagesize());
> +
> +                             break;
> +                     case 'l':
> +                             g_mode = DMODE_LIST;
> +                             modec++;
> +                             break;
> +                     case 's':
> +                             prgv[prgc++] = optarg;
> +                             break;
> +                     case 'V':
> +                             g_mode = DMODE_VERS;
> +                             modec++;
> +                             break;
> +                     default:
> +                             if (strchr(dtrace_options, opt) == NULL)
> +                                     return usage();
> +                     }
> +             }
> +
> +             if (optind < argc) {
> +                     fprintf(stderr, "unknown option '%s'\n", argv[optind]);
> +                     return E_ERROR;
> +             }
> +     }
> +
> +     if (modec > 1) {
> +             fprintf(stderr,
> +                     "only one of [-lV] can be specified at a time\n");
> +             return E_USAGE;
> +     }
> +
> +     /*
> +      * We handle requests for version information first because we do not
> +      * need probe information for it.
> +      */
> +     if (g_mode == DMODE_VERS) {
> +             printf("%s\n"
> +                    "This is DTrace %s\n"
> +                    "dtrace(1) version-control ID: %s\n",
> +                    DT_VERS_STRING, DT_VERSION, DT_GIT_VERSION);
> +
> +             return E_SUCCESS;
> +     }
> +
> +     /* Initialize probes. */
> +     if (dt_probe_init() < 0) {
> +             fprintf(stderr, "failed to initialize probes: %s\n",
> +                     strerror(errno));
> +             return E_ERROR;
> +     }
> +
> +     /*
> +      * We handle requests to list probes next.
> +      */
> +     if (g_mode == DMODE_LIST) {
> +             int     rc = 0;
> +
> +             printf("%5s %10s %17s %33s %s\n",
> +                    "ID", "PROVIDER", "MODULE", "FUNCTION", "NAME");
> +             for (i = 0; i < prgc; i++) {
> +                     rc = dt_bpf_list_probes(prgv[i]);
> +                     if (rc < 0)
> +                             fprintf(stderr, "failed to load %s: %s\n",
> +                                     prgv[i], strerror(errno));
> +             }
> +
> +             return rc ? E_ERROR : E_SUCCESS;
> +     }
> +
> +     if (!prgc) {
> +             fprintf(stderr, "missing BPF program(s)\n");
> +             return E_ERROR;
> +     }
> +
> +     /* Process the BPF program. */
> +     for (i = 0; i < prgc; i++) {
> +             int     err;
> +
> +             err = dt_bpf_load_file(prgv[i]);
> +             if (err) {
> +                     errno = -err;
> +                     fprintf(stderr, "failed to load %s: %s\n",
> +                             prgv[i], strerror(errno));
> +                     return E_ERROR;
> +             }
> +     }
> +
> +     /* Get the list of online CPUs. */
> +     cpu_list_populate();
> +
> +     /* Initialize buffers. */
> +     epoll_fd = dt_buffer_init(bufsize);
> +     if (epoll_fd < 0) {
> +             errno = -epoll_fd;
> +             fprintf(stderr, "failed to allocate buffers: %s\n",
> +                     strerror(errno));
> +             return E_ERROR;
> +     }
> +
> +     /* Process probe data. */
> +     printf("%3s %6s\n", "CPU", "ID");
> +     do {
> +             cnt = dt_buffer_poll(epoll_fd, 100);
> +     } while (cnt >= 0);
> +
> +     dt_buffer_exit(epoll_fd);
> +
> +     return E_SUCCESS;
> +}
> diff --git a/tools/dtrace/dtrace.h b/tools/dtrace/dtrace.h
> new file mode 100644
> index 000000000000..c79398432d17
> --- /dev/null
> +++ b/tools/dtrace/dtrace.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/*
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +#ifndef _UAPI_LINUX_DTRACE_H
> +#define _UAPI_LINUX_DTRACE_H
> +
> +struct dt_bpf_context {
> +     u32             probe_id;
> +     u64             argv[10];
> +};
> +
> +#endif /* _UAPI_LINUX_DTRACE_H */
> diff --git a/tools/dtrace/dtrace_impl.h b/tools/dtrace/dtrace_impl.h
> new file mode 100644
> index 000000000000..9aa51b4c4aee
> --- /dev/null
> +++ b/tools/dtrace/dtrace_impl.h
> @@ -0,0 +1,101 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
> + */
> +#ifndef _DTRACE_H
> +#define _DTRACE_H
> +
> +#include <unistd.h>
> +#include <bpf/libbpf.h>
> +#include <linux/types.h>
> +#include <linux/ptrace.h>
> +#include <linux/perf_event.h>
> +#include <sys/syscall.h>
> +
> +#include "dtrace.h"
> +
> +#define DT_DEBUG
> +
> +#define DT_VERS_STRING       "Oracle D 2.0.0"
> +
> +#define TRACEFS              "/sys/kernel/debug/tracing/"
> +#define EVENTSFS     TRACEFS "events/"
> +
> +extern int   dt_numcpus;
> +extern int   dt_maxcpuid;
> +extern int   *dt_cpuids;
> +
> +extern void cpu_list_populate(void);
> +extern void cpu_list_free(void);
> +
> +struct dt_provider {
> +     char            *name;
> +     int             (*populate)(void);
> +     struct dt_probe *(*resolve_event)(const char *name);
> +     int             (*attach)(const char *name, int bpf_fd);
> +};
> +
> +extern struct dt_provider    dt_fbt;
> +extern struct dt_provider    dt_syscall;
> +
> +struct dt_hentry {
> +     struct dt_probe         *next;
> +     struct dt_probe         *prev;
> +};
> +
> +struct dt_htab;
> +
> +typedef u32 (*dt_hval_fn)(const struct dt_probe *);
> +typedef int (*dt_cmp_fn)(const struct dt_probe *, const struct dt_probe *);
> +typedef struct dt_probe *(*dt_add_fn)(struct dt_probe *, struct dt_probe *);
> +typedef struct dt_probe *(*dt_del_fn)(struct dt_probe *, struct dt_probe *);
> +
> +extern struct dt_htab *dt_htab_new(dt_hval_fn hval, dt_cmp_fn cmp,
> +                                dt_add_fn add, dt_del_fn del);
> +extern int dt_htab_add(struct dt_htab *htab, struct dt_probe *probe);
> +extern struct dt_probe *dt_htab_lookup(const struct dt_htab *htab,
> +                                    const struct dt_probe *probe);
> +extern int dt_htab_del(struct dt_htab *htab, struct dt_probe *probe);
> +
> +struct dt_probe {
> +     u32                             id;
> +     int                             event_fd;
> +     const struct dt_provider        *prov;
> +     const char                      *prv_name;      /* provider name */
> +     const char                      *mod_name;      /* module name */
> +     const char                      *fun_name;      /* function name */
> +     const char                      *prb_name;      /* probe name */
> +     struct dt_hentry                he_fqn;
> +};
> +
> +typedef void (*dt_probe_fn)(const struct dt_probe *probe);
> +
> +extern int dt_probe_init(void);
> +extern int dt_probe_new(const struct dt_provider *prov, const char *pname,
> +                     const char *mname, const char *fname, const char *name);
> +extern struct dt_probe *dt_probe_by_name(const struct dt_probe *tmpl);
> +extern struct dt_probe *dt_probe_resolve_event(const char *name);
> +
> +extern int dt_bpf_list_probes(const char *fn);
> +extern int dt_bpf_load_file(const char *fn);
> +extern int dt_bpf_map_update(int fd, const void *key, const void *val);
> +extern int dt_bpf_attach(int event_id, int bpf_fd);
> +
> +extern int dt_bufmap_fd;
> +
> +extern int dt_buffer_init(int num_pages);
> +extern int dt_buffer_poll(int epoll_fd, int timeout);
> +extern void dt_buffer_exit(int epoll_fd);
> +
> +static inline int perf_event_open(struct perf_event_attr *attr, pid_t pid,
> +                               int cpu, int group_fd, unsigned long flags)
> +{
> +     return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
> +}
> +
> +extern inline int bpf(enum bpf_cmd cmd, union bpf_attr *attr)
> +{
> +     return syscall(__NR_bpf, cmd, attr, sizeof(union bpf_attr));
> +}
> +
> +#endif /* _DTRACE_H */
> -- 
> 2.20.1

Reply via email to