On 08/06/16 21:55, Emilio G. Cota wrote: > Sometimes it is useful to have a quick histogram to represent a certain > distribution -- for example, when investigating a performance regression > in a hash table due to inadequate hashing. > > The appended allows us to easily represent a distribution using Unicode > characters. Further, the data structure keeping track of the distribution > is so simple that obtaining its values for off-line processing is trivial. > > Example, taking the last 10 commits to QEMU: > > Characters in commit title Count > ----------------------------------- > 39 1 > 48 1 > 53 1 > 54 2 > 57 1 > 61 1 > 67 1 > 78 1 > 80 1 > qdist_init(&dist); > qdist_inc(&dist, 39); > [...] > qdist_inc(&dist, 80); > > char *str = qdist_pr(&dist, 9, QDIST_PR_LABELS); > // -> [39.0,43.6)▂▂ █▂ ▂ ▄[75.4,80.0] > g_free(str); > > char *str = qdist_pr(&dist, 4, QDIST_PR_LABELS); > // -> [39.0,49.2)▁█▁▁[69.8,80.0] > g_free(str); > > Reviewed-by: Richard Henderson <r...@twiddle.net> > Signed-off-by: Emilio G. Cota <c...@braap.org>
Reviewed-by: Sergey Fedorov <sergey.fedo...@linaro.org> > --- > include/qemu/qdist.h | 63 ++++++++ > util/Makefile.objs | 1 + > util/qdist.c | 395 > +++++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 459 insertions(+) > create mode 100644 include/qemu/qdist.h > create mode 100644 util/qdist.c > > diff --git a/include/qemu/qdist.h b/include/qemu/qdist.h > new file mode 100644 > index 0000000..f30050c > --- /dev/null > +++ b/include/qemu/qdist.h > @@ -0,0 +1,63 @@ > +/* > + * Copyright (C) 2016, Emilio G. Cota <c...@braap.org> > + * > + * License: GNU GPL, version 2 or later. > + * See the COPYING file in the top-level directory. > + */ > +#ifndef QEMU_QDIST_H > +#define QEMU_QDIST_H > + > +#include "qemu/osdep.h" > +#include "qemu-common.h" > +#include "qemu/bitops.h" > + > +/* > + * Samples with the same 'x value' end up in the same qdist_entry, > + * e.g. inc(0.1) and inc(0.1) end up as {x=0.1, count=2}. > + * > + * Binning happens only at print time, so that we retain the flexibility to > + * choose the binning. This might not be ideal for workloads that do not care > + * much about precision and insert many samples all with different x values; > + * in that case, pre-binning (e.g. entering both 0.115 and 0.097 as 0.1) > + * should be considered. > + */ > +struct qdist_entry { > + double x; > + unsigned long count; > +}; > + > +struct qdist { > + struct qdist_entry *entries; > + size_t n; > + size_t size; > +}; > + > +#define QDIST_PR_BORDER BIT(0) > +#define QDIST_PR_LABELS BIT(1) > +/* the remaining options only work if PR_LABELS is set */ > +#define QDIST_PR_NODECIMAL BIT(2) > +#define QDIST_PR_PERCENT BIT(3) > +#define QDIST_PR_100X BIT(4) > +#define QDIST_PR_NOBINRANGE BIT(5) > + > +void qdist_init(struct qdist *dist); > +void qdist_destroy(struct qdist *dist); > + > +void qdist_add(struct qdist *dist, double x, long count); > +void qdist_inc(struct qdist *dist, double x); > +double qdist_xmin(const struct qdist *dist); > +double qdist_xmax(const struct qdist *dist); > +double qdist_avg(const struct qdist *dist); > +unsigned long qdist_sample_count(const struct qdist *dist); > +size_t qdist_unique_entries(const struct qdist *dist); > + > +/* callers must free the returned string with g_free() */ > +char *qdist_pr_plain(const struct qdist *dist, size_t n_groups); > + > +/* callers must free the returned string with g_free() */ > +char *qdist_pr(const struct qdist *dist, size_t n_groups, uint32_t opt); > + > +/* Only qdist code and test code should ever call this function */ > +void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t > n); > + > +#endif /* QEMU_QDIST_H */ > diff --git a/util/Makefile.objs b/util/Makefile.objs > index a8a777e..702435e 100644 > --- a/util/Makefile.objs > +++ b/util/Makefile.objs > @@ -32,3 +32,4 @@ util-obj-y += buffer.o > util-obj-y += timed-average.o > util-obj-y += base64.o > util-obj-y += log.o > +util-obj-y += qdist.o > diff --git a/util/qdist.c b/util/qdist.c > new file mode 100644 > index 0000000..4ea2e34 > --- /dev/null > +++ b/util/qdist.c > @@ -0,0 +1,395 @@ > +/* > + * qdist.c - QEMU helpers for handling frequency distributions of data. > + * > + * Copyright (C) 2016, Emilio G. Cota <c...@braap.org> > + * > + * License: GNU GPL, version 2 or later. > + * See the COPYING file in the top-level directory. > + */ > +#include "qemu/qdist.h" > + > +#include <math.h> > +#ifndef NAN > +#define NAN (0.0 / 0.0) > +#endif > + > +void qdist_init(struct qdist *dist) > +{ > + dist->entries = g_malloc(sizeof(*dist->entries)); > + dist->size = 1; > + dist->n = 0; > +} > + > +void qdist_destroy(struct qdist *dist) > +{ > + g_free(dist->entries); > +} > + > +static inline int qdist_cmp_double(double a, double b) > +{ > + if (a > b) { > + return 1; > + } else if (a < b) { > + return -1; > + } > + return 0; > +} > + > +static int qdist_cmp(const void *ap, const void *bp) > +{ > + const struct qdist_entry *a = ap; > + const struct qdist_entry *b = bp; > + > + return qdist_cmp_double(a->x, b->x); > +} > + > +void qdist_add(struct qdist *dist, double x, long count) > +{ > + struct qdist_entry *entry = NULL; > + > + if (dist->n) { > + struct qdist_entry e; > + > + e.x = x; > + entry = bsearch(&e, dist->entries, dist->n, sizeof(e), qdist_cmp); > + } > + > + if (entry) { > + entry->count += count; > + return; > + } > + > + if (unlikely(dist->n == dist->size)) { > + dist->size *= 2; > + dist->entries = g_realloc(dist->entries, > + sizeof(*dist->entries) * (dist->size)); > + } > + dist->n++; > + entry = &dist->entries[dist->n - 1]; > + entry->x = x; > + entry->count = count; > + qsort(dist->entries, dist->n, sizeof(*entry), qdist_cmp); > +} > + > +void qdist_inc(struct qdist *dist, double x) > +{ > + qdist_add(dist, x, 1); > +} > + > +/* > + * Unicode for block elements. See: > + * https://en.wikipedia.org/wiki/Block_Elements > + */ > +static const gunichar qdist_blocks[] = { > + 0x2581, > + 0x2582, > + 0x2583, > + 0x2584, > + 0x2585, > + 0x2586, > + 0x2587, > + 0x2588 > +}; > + > +#define QDIST_NR_BLOCK_CODES ARRAY_SIZE(qdist_blocks) > + > +/* > + * Print a distribution into a string. > + * > + * This function assumes that appropriate binning has been done on the input; > + * see qdist_bin__internal() and qdist_pr_plain(). > + * > + * Callers must free the returned string with g_free(). > + */ > +static char *qdist_pr_internal(const struct qdist *dist) > +{ > + double min, max; > + GString *s = g_string_new(""); > + size_t i; > + > + /* if only one entry, its printout will be either full or empty */ > + if (dist->n == 1) { > + if (dist->entries[0].count) { > + g_string_append_unichar(s, qdist_blocks[QDIST_NR_BLOCK_CODES - > 1]); > + } else { > + g_string_append_c(s, ' '); > + } > + goto out; > + } > + > + /* get min and max counts */ > + min = dist->entries[0].count; > + max = min; > + for (i = 0; i < dist->n; i++) { > + struct qdist_entry *e = &dist->entries[i]; > + > + if (e->count < min) { > + min = e->count; > + } > + if (e->count > max) { > + max = e->count; > + } > + } > + > + for (i = 0; i < dist->n; i++) { > + struct qdist_entry *e = &dist->entries[i]; > + int index; > + > + /* make an exception with 0; instead of using block[0], print a > space */ > + if (e->count) { > + /* divide first to avoid loss of precision when e->count == max > */ > + index = (e->count - min) / (max - min) * (QDIST_NR_BLOCK_CODES - > 1); > + g_string_append_unichar(s, qdist_blocks[index]); > + } else { > + g_string_append_c(s, ' '); > + } > + } > + out: > + return g_string_free(s, FALSE); > +} > + > +/* > + * Bin the distribution in @from into @n bins of consecutive, non-overlapping > + * intervals, copying the result to @to. > + * > + * This function is internal to qdist: only this file and test code should > + * ever call it. > + * > + * Note: calling this function on an already-binned qdist is a bug. > + * > + * If @n == 0 or @from->n == 1, use @from->n. > + */ > +void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t > n) > +{ > + double xmin, xmax; > + double step; > + size_t i, j; > + > + qdist_init(to); > + > + if (from->n == 0) { > + return; > + } > + if (n == 0 || from->n == 1) { > + n = from->n; > + } > + > + /* set equally-sized bins between @from's left and right */ > + xmin = qdist_xmin(from); > + xmax = qdist_xmax(from); > + step = (xmax - xmin) / n; > + > + if (n == from->n) { > + /* if @from's entries are equally spaced, no need to re-bin */ > + for (i = 0; i < from->n; i++) { > + if (from->entries[i].x != xmin + i * step) { > + goto rebin; > + } > + } > + /* they're equally spaced, so copy the dist and bail out */ > + to->entries = g_new(struct qdist_entry, from->n); > + to->n = from->n; > + memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n); > + return; > + } > + > + rebin: > + j = 0; > + for (i = 0; i < n; i++) { > + double x; > + double left, right; > + > + left = xmin + i * step; > + right = xmin + (i + 1) * step; > + > + /* Add x, even if it might not get any counts later */ > + x = left; > + qdist_add(to, x, 0); > + > + /* > + * To avoid double-counting we capture [left, right) ranges, except > for > + * the righmost bin, which captures a [left, right] range. > + */ > + while (j < from->n && (from->entries[j].x < right || i == n - 1)) { > + struct qdist_entry *o = &from->entries[j]; > + > + qdist_add(to, x, o->count); > + j++; > + } > + } > +} > + > +/* > + * Print @dist into a string, after re-binning it into @n bins of > consecutive, > + * non-overlapping intervals. > + * > + * If @n == 0, use @orig->n. > + * > + * Callers must free the returned string with g_free(). > + */ > +char *qdist_pr_plain(const struct qdist *dist, size_t n) > +{ > + struct qdist binned; > + char *ret; > + > + if (dist->n == 0) { > + return NULL; > + } > + qdist_bin__internal(&binned, dist, n); > + ret = qdist_pr_internal(&binned); > + qdist_destroy(&binned); > + return ret; > +} > + > +static char *qdist_pr_label(const struct qdist *dist, size_t n_bins, > + uint32_t opt, bool is_left) > +{ > + const char *percent; > + const char *lparen; > + const char *rparen; > + GString *s; > + double x1, x2, step; > + double x; > + double n; > + int dec; > + > + s = g_string_new(""); > + if (!(opt & QDIST_PR_LABELS)) { > + goto out; > + } > + > + dec = opt & QDIST_PR_NODECIMAL ? 0 : 1; > + percent = opt & QDIST_PR_PERCENT ? "%" : ""; > + > + n = n_bins ? n_bins : dist->n; > + x = is_left ? qdist_xmin(dist) : qdist_xmax(dist); > + step = (qdist_xmax(dist) - qdist_xmin(dist)) / n; > + > + if (opt & QDIST_PR_100X) { > + x *= 100.0; > + step *= 100.0; > + } > + if (opt & QDIST_PR_NOBINRANGE) { > + lparen = rparen = ""; > + x1 = x; > + x2 = x; /* unnecessary, but a dumb compiler might not figure it out > */ > + } else { > + lparen = "["; > + rparen = is_left ? ")" : "]"; > + if (is_left) { > + x1 = x; > + x2 = x + step; > + } else { > + x1 = x - step; > + x2 = x; > + } > + } > + g_string_append_printf(s, "%s%.*f", lparen, dec, x1); > + if (!(opt & QDIST_PR_NOBINRANGE)) { > + g_string_append_printf(s, ",%.*f%s", dec, x2, rparen); > + } > + g_string_append(s, percent); > + out: > + return g_string_free(s, FALSE); > +} > + > +/* > + * Print the distribution's histogram into a string. > + * > + * See also: qdist_pr_plain(). > + * > + * Callers must free the returned string with g_free(). > + */ > +char *qdist_pr(const struct qdist *dist, size_t n_bins, uint32_t opt) > +{ > + const char *border = opt & QDIST_PR_BORDER ? "|" : ""; > + char *llabel, *rlabel; > + char *hgram; > + GString *s; > + > + if (dist->n == 0) { > + return NULL; > + } > + > + s = g_string_new(""); > + > + llabel = qdist_pr_label(dist, n_bins, opt, true); > + rlabel = qdist_pr_label(dist, n_bins, opt, false); > + hgram = qdist_pr_plain(dist, n_bins); > + g_string_append_printf(s, "%s%s%s%s%s", > + llabel, border, hgram, border, rlabel); > + g_free(llabel); > + g_free(rlabel); > + g_free(hgram); > + > + return g_string_free(s, FALSE); > +} > + > +static inline double qdist_x(const struct qdist *dist, int index) > +{ > + if (dist->n == 0) { > + return NAN; > + } > + return dist->entries[index].x; > +} > + > +double qdist_xmin(const struct qdist *dist) > +{ > + return qdist_x(dist, 0); > +} > + > +double qdist_xmax(const struct qdist *dist) > +{ > + return qdist_x(dist, dist->n - 1); > +} > + > +size_t qdist_unique_entries(const struct qdist *dist) > +{ > + return dist->n; > +} > + > +unsigned long qdist_sample_count(const struct qdist *dist) > +{ > + unsigned long count = 0; > + size_t i; > + > + for (i = 0; i < dist->n; i++) { > + struct qdist_entry *e = &dist->entries[i]; > + > + count += e->count; > + } > + return count; > +} > + > +static double qdist_pairwise_avg(const struct qdist *dist, size_t index, > + size_t n, unsigned long count) > +{ > + /* amortize the recursion by using a base case > 2 */ > + if (n <= 8) { > + size_t i; > + double ret = 0; > + > + for (i = 0; i < n; i++) { > + struct qdist_entry *e = &dist->entries[index + i]; > + > + ret += e->x * e->count / count; > + } > + return ret; > + } else { > + size_t n2 = n / 2; > + > + return qdist_pairwise_avg(dist, index, n2, count) + > + qdist_pairwise_avg(dist, index + n2, n - n2, count); > + } > +} > + > +double qdist_avg(const struct qdist *dist) > +{ > + unsigned long count; > + > + count = qdist_sample_count(dist); > + if (!count) { > + return NAN; > + } > + return qdist_pairwise_avg(dist, 0, dist->n, count); > +}