Ok, so the nonlinear thing was that it did _two_ copies: one the regular __output_copy() on raw->data, and a second, optional fragment copy using __output_custom().
Would something like this work instead? It does the nonlinear thing and the custom copy function thing, but allows more than 2 fragments and allows each fragment to have a custom copy. It doesn't look obviously more expensive; it has the one ->copy branch extra, but then it doesn't recompute the sizes.

---
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1fe22032f228..83e2a83e8db3 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -69,9 +69,18 @@ struct perf_callchain_entry_ctx {
 	bool				contexts_maxed;
 };
 
+typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long len);
+
+struct perf_raw_frag {
+	struct perf_raw_frag		*next;
+	perf_copy_f			copy;
+	void				*data;
+	u32				size;
+} __packed;
+
 struct perf_raw_record {
+	struct perf_raw_frag		frag;
 	u32				size;
-	void				*data;
 };
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fe8d49a56322..f7ad7d65317d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5617,16 +5617,21 @@ void perf_output_sample(struct perf_output_handle *handle,
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
-		if (data->raw) {
-			u32 raw_size = data->raw->size;
-			u32 real_size = round_up(raw_size + sizeof(u32),
-						 sizeof(u64)) - sizeof(u32);
-			u64 zero = 0;
-
-			perf_output_put(handle, real_size);
-			__output_copy(handle, data->raw->data, raw_size);
-			if (real_size - raw_size)
-				__output_copy(handle, &zero, real_size - raw_size);
+		struct perf_raw_record *raw = data->raw;
+
+		if (raw) {
+			struct perf_raw_frag *frag = &raw->frag;
+
+			perf_output_put(handle, raw->size);
+			do {
+				if (frag->copy) {
+					__output_custom(handle, frag->copy,
+							frag->data, frag->size);
+				} else {
+					__output_copy(handle, frag->data, frag->size);
+				}
+				frag = frag->next;
+			} while (frag);
 		} else {
 			struct {
 				u32	size;
@@ -5751,14 +5756,22 @@ void perf_prepare_sample(struct perf_event_header *header,
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
-		int size = sizeof(u32);
+		struct perf_raw_record *raw = data->raw;
+		int size = sizeof(u64);
 
-		if (data->raw)
-			size += data->raw->size;
-		else
-			size += sizeof(u32);
+		if (raw) {
+			struct perf_raw_frag *frag = &raw->frag;
 
-		header->size += round_up(size, sizeof(u64));
+			size = sizeof(u32);
+			do {
+				size += frag->size;
+				frag = frag->next;
+			} while (frag);
+			size = round_up(size, sizeof(u64));
+			raw->size = size;
+		}
+
+		header->size += size;
 	}
 
 	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {