This patch adds a PERF_COUNT_SW_USERSPACE_EVENT type,
which can be generated by the user with the
PERF_EVENT_IOC_USERSPACE ioctl command, which injects
an event of said type into the perf buffer.

The ioctl takes a pointer to struct perf_event_userspace
as an argument. The structure begins with a 32-bit
integer type value, which determines the meaning of the
following content (size/data pair). Type 0 is defined
as a zero-terminated string; other types are defined by
userspace (the perf tool will contain a list of
known values with reference implementations of data
content parsers).

Possible use cases for this feature:

- "perf_printf" like mechanism to add logging messages
  to one's perf session; an example implementation:

        /*
         * Format a message and inject it into the perf buffer as a
         * type 0 (zero-terminated string) userspace event.
         *
         * perf_fd: perf event file descriptor to issue the ioctl on
         * fmt:     printf-style format string, followed by its arguments
         *
         * Returns the number of characters logged (excluding the
         * terminating NUL) on success, or a negative value if
         * formatting, allocation or the ioctl fails.
         */
        int perf_printf(int perf_fd, const char *fmt, ...)
        {
                struct perf_event_userspace *event;
                va_list ap, ap_len;
                int len;
                int err;

                va_start(ap, fmt);

                /*
                 * A va_list is indeterminate once vsnprintf() has
                 * consumed it, so measure the length on a copy and
                 * keep 'ap' intact for the real formatting pass.
                 */
                va_copy(ap_len, ap);
                len = vsnprintf(NULL, 0, fmt, ap_len);
                va_end(ap_len);
                if (len < 0) {
                        va_end(ap);
                        return -1;
                }

                /* Payload is the formatted text plus its NUL */
                event = malloc(sizeof(*event) + len + 1);
                if (!event) {
                        va_end(ap);
                        return -1;
                }

                event->type = 0;
                event->size = len + 1;
                vsnprintf((char *)event->data, len + 1, fmt, ap);

                va_end(ap);

                err = ioctl(perf_fd, PERF_EVENT_IOC_USERSPACE, event);

                free(event);

                return err < 0 ? err : len;
        }

- "perf_printf" used by the perf trace tool,
  where certain calls of the traced process are intercepted
  (eg. using LD_PRELOAD) and treated as logging
  requests, with their output redirected into the
  perf buffer

- synchronisation of performance data generated in
  user space with the perf stream coming from the kernel.
  For example, the marker can be inserted by a JIT engine
  after it has generated a portion of the code, but before
  the code is executed for the first time, allowing the
  post-processor to pick the correct debugging
  information.

- another example is a system profiling tool taking data
  from other sources than just perf, which generates a marker
  at the beginning and at the end of the session
  (also possibly periodically during the session) to
  synchronise kernel timestamps with clock values
  obtained in userspace (gtod or raw_monotonic).

Signed-off-by: Pawel Moll <pawel.m...@arm.com>
---
 include/linux/perf_event.h      |  8 +++++
 include/uapi/linux/perf_event.h | 34 ++++++++++++++++++++-
 kernel/events/core.c            | 68 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 28b73b2..d904d31 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -64,6 +64,12 @@ struct perf_raw_record {
        void                            *data;
 };
 
+/*
+ * In-kernel representation of a userspace-originating entry:
+ * a 'type' tag, the payload length in bytes and the payload itself.
+ */
+struct perf_userspace_entry {
+       u32                             type;
+       u32                             size;
+       u8                              data[];
+};
+
 /*
  * branch stack layout:
  *  nr: number of taken branches stored in entries[]
@@ -604,6 +610,8 @@ struct perf_sample_data {
        u64                             txn;
        /* Raw monotonic timestamp, for userspace time correlation */
        u64                             clock_raw_monotonic;
+       /* Userspace-originating event */
+       struct perf_userspace_entry     *user_entry;
 };
 
 static inline void perf_sample_data_init(struct perf_sample_data *data,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e5a75c5..37604ae 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -110,6 +110,7 @@ enum perf_sw_ids {
        PERF_COUNT_SW_ALIGNMENT_FAULTS          = 7,
        PERF_COUNT_SW_EMULATION_FAULTS          = 8,
        PERF_COUNT_SW_DUMMY                     = 9,
+       PERF_COUNT_SW_USERSPACE_EVENT           = 10,
 
        PERF_COUNT_SW_MAX,                      /* non-ABI */
 };
@@ -138,8 +139,9 @@ enum perf_event_sample_format {
        PERF_SAMPLE_IDENTIFIER                  = 1U << 16,
        PERF_SAMPLE_TRANSACTION                 = 1U << 17,
        PERF_SAMPLE_CLOCK_RAW_MONOTONIC         = 1U << 18,
+       PERF_SAMPLE_USERSPACE_EVENT             = 1U << 19,
 
-       PERF_SAMPLE_MAX = 1U << 19,             /* non-ABI */
+       PERF_SAMPLE_MAX = 1U << 20,             /* non-ABI */
 };
 
 /*
@@ -337,6 +339,15 @@ struct perf_event_attr {
        __u32   __reserved_2;
 };
 
+/*
+ * Userspace-originating event to be generated with PERF_EVENT_IOC_USERSPACE
+ *
+ * type: selects how 'data' is interpreted
+ * size: number of bytes in 'data'
+ * data: payload, 'size' bytes long
+ */
+struct perf_event_userspace {
+       __u32   type;
+       __u32   size;
+       __u8    data[];
+};
+
 #define perf_flags(attr)       (*(&(attr)->read_format + 1))
 
 /*
@@ -350,6 +361,8 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_SET_OUTPUT      _IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER      _IOW('$', 6, char *)
 #define PERF_EVENT_IOC_ID              _IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_USERSPACE       _IOR('$', 8, \
+                                               struct perf_event_userspace *)
 
 enum perf_event_ioc_flags {
        PERF_IOC_FLAG_GROUP             = 1U << 0,
@@ -688,6 +701,25 @@ enum perf_event_type {
         *      { u64                   data_src; } && PERF_SAMPLE_DATA_SRC
         *      { u64                   transaction; } && 
PERF_SAMPLE_TRANSACTION
         *      { u64                   clock_raw_monotonic; } && 
PERF_SAMPLE_CLOCK_RAW_MONOTONIC
+        *
+        *      #
+        *      # Contents of USERSPACE_EVENT sample data depend on its type.
+        *      #
+        *      # Type 0 means that the data is a zero-terminated string that
+        *      # can be printf-ed in the normal way.
+        *      #
+        *      # Meaning of other type values depends on the userspace
+        *      # and the perf tool code contains a list of those with
+        *      # reference implementations of parsers.
+        *      #
+        *      # Overall size of the sample (including type and size fields)
+        *      # is always aligned to 8 bytes by adding padding after
+        *      # the data.
+        *      #
+        *      { u32                   type;
+        *        u32                   size;
+        *        char                  data[size];
+        *        char                  __padding[] } && 
PERF_SAMPLE_USERSPACE_EVENT
         * };
         */
        PERF_RECORD_SAMPLE                      = 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f6df547..11bf1be 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3655,6 +3655,8 @@ static inline int perf_fget_light(int fd, struct fd *p)
 static int perf_event_set_output(struct perf_event *event,
                                 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_sw_userspace_entry(struct perf_event *event,
+              struct perf_event_userspace __user *arg);
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -3709,6 +3711,10 @@ static long perf_ioctl(struct file *file, unsigned int 
cmd, unsigned long arg)
        case PERF_EVENT_IOC_SET_FILTER:
                return perf_event_set_filter(event, (void __user *)arg);
 
+       case PERF_EVENT_IOC_USERSPACE:
+               return perf_sw_userspace_entry(event,
+                               (struct perf_event_userspace __user *)arg);
+
        default:
                return -ENOTTY;
        }
@@ -3728,6 +3734,7 @@ static long perf_compat_ioctl(struct file *file, unsigned 
int cmd,
        switch (_IOC_NR(cmd)) {
        case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
        case _IOC_NR(PERF_EVENT_IOC_ID):
+       case _IOC_NR(PERF_EVENT_IOC_USERSPACE):
                /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
                if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
                        cmd &= ~IOCSIZE_MASK;
@@ -4727,6 +4734,16 @@ void perf_output_sample(struct perf_output_handle 
*handle,
        if (sample_type & PERF_SAMPLE_CLOCK_RAW_MONOTONIC)
                perf_output_put(handle, data->clock_raw_monotonic);
 
+       if (sample_type & PERF_SAMPLE_USERSPACE_EVENT) {
+               int size = data->user_entry->size;
+               int padding = ALIGN(size, sizeof(u64)) - size;
+
+               perf_output_put(handle, data->user_entry->type);
+               perf_output_put(handle, size);
+               __output_copy(handle, data->user_entry->data, size);
+               perf_output_skip(handle, padding);
+       };
+
        if (!event->attr.watermark) {
                int wakeup_events = event->attr.wakeup_events;
 
@@ -4834,6 +4851,24 @@ void perf_prepare_sample(struct perf_event_header 
*header,
                data->stack_user_size = stack_size;
                header->size += size;
        }
+
+       if (sample_type & PERF_SAMPLE_USERSPACE_EVENT) {
+               int size = data->user_entry->size;
+
+               /*
+                * Type 0 means zero-terminated string;
+                * make sure it is terminated
+                */
+               if (!data->user_entry->type)
+                       data->user_entry->data[size - 1] = '\0';
+
+               /*
+                * The sample consist of 'type' and 'size' u32 fields
+                * followed with data and padding aligning it to 8 bytes.
+                */
+               header->size += sizeof(u32) + sizeof(u32) +
+                               ALIGN(size, sizeof(u64));
+       }
 }
 
 static void perf_event_output(struct perf_event *event,
@@ -5961,6 +5996,39 @@ static struct pmu perf_swevent = {
        .event_idx      = perf_swevent_event_idx,
 };
 
+/*
+ * Copy a userspace-provided entry into the kernel and emit it as a
+ * sample on @event.
+ *
+ * @event: perf event the request was issued on
+ * @arg:   user pointer to a struct perf_event_userspace header followed
+ *         by 'size' bytes of data
+ *
+ * Returns 0 on success (or when no userspace software event is enabled),
+ * -EINVAL for a NULL @arg, -EFAULT when the size cannot be read from
+ * userspace, -E2BIG for an oversized payload, or the error reported by
+ * memdup_user().
+ */
+static int perf_sw_userspace_entry(struct perf_event *event,
+              struct perf_event_userspace __user *arg)
+{
+       u32 size;
+       struct perf_sample_data data;
+       struct pt_regs *regs = current_pt_regs();
+       struct perf_userspace_entry *entry;
+
+       if (!arg)
+               return -EINVAL;
+
+       if (!static_key_false(&perf_swevent_enabled[
+                               PERF_COUNT_SW_USERSPACE_EVENT]))
+               return 0;
+
+       BUILD_BUG_ON(sizeof(size) != sizeof(arg->size));
+       if (copy_from_user(&size, &arg->size, sizeof(size)) != 0)
+               return -EFAULT;
+
+       /*
+        * Bound the payload so that the entry header plus the 8-byte
+        * aligned data cannot exceed what a 16-bit record size can
+        * describe; this also keeps sizeof(*arg) + size from wrapping.
+        * NOTE(review): the other sample fields share the same record,
+        * so the effective limit may need to be lower - confirm.
+        */
+       if (size > USHRT_MAX - sizeof(*arg) - sizeof(u64))
+               return -E2BIG;
+
+       BUILD_BUG_ON(sizeof(*arg) != sizeof(*entry));
+       entry = memdup_user(arg, sizeof(*arg) + size);
+       if (IS_ERR(entry))
+               return PTR_ERR(entry);
+
+       /*
+        * The header was fetched from userspace a second time by
+        * memdup_user() and may have changed in between; only trust
+        * the size the buffer was actually allocated with.
+        */
+       entry->size = size;
+
+       perf_sample_data_init(&data, 0, 0);
+       data.user_entry = entry;
+       perf_event_output(event, &data, regs);
+
+       kfree(entry);
+
+       return 0;
+}
+
 #ifdef CONFIG_EVENT_TRACING
 
 static int perf_tp_filter_match(struct perf_event *event,
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to