On 24/10/2016 07:55, vijay.kil...@gmail.com wrote:
> From: Vijaya Kumar K <vijaya.ku...@cavium.com>
>
> Thunderx pass2 chip requires explicit prefetch
> instruction to give prefetch hint.
>
> To speed up live migration on Thunderx platform,
> prefetch instruction is added in zero buffer check
> function. The below results show live migration time improvement
> with prefetch instruction. VM with 4 VCPUs, 8GB RAM is migrated.
>
> Without prefetch total migration time is ~13 seconds
> adding prefetch total migration time is 9.5 seconds
>
> Code for decoding cache size is taken from Richard's
> patch
>
> Signed-off-by: Vijaya Kumar K <vijaya.ku...@cavium.com>
> ---
>  util/bufferiszero.c | 37 ++++++++++++++++++++++++++++++++++++-
>  1 file changed, 36 insertions(+), 1 deletion(-)
>
> diff --git a/util/bufferiszero.c b/util/bufferiszero.c
> index 421d945..f50b8df 100644
> --- a/util/bufferiszero.c
> +++ b/util/bufferiszero.c
> @@ -25,6 +25,10 @@
>  #include "qemu-common.h"
>  #include "qemu/cutils.h"
>  #include "qemu/bswap.h"
> +#include <math.h>
> +
> +static uint32_t cache_line_factor = 1;
Let's express this in bytes, with a default value of 64 (so rename
cache_line_factor->cache_line_size).

> +static uint32_t prefetch_line_dist = 1;
>
>  static bool
>  buffer_zero_int(const void *buf, size_t len)
> @@ -49,7 +53,8 @@ buffer_zero_int(const void *buf, size_t len)
>          const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);
>
>          for (; p + 8 <= e; p += 8) {
> -            __builtin_prefetch(p + 8, 0, 0);
> +            __builtin_prefetch(p +
> +                (8 * cache_line_factor * prefetch_line_dist), 0, 0);

You should precompute cache_line_size * prefetch_line_dist /
sizeof(uint64_t) in a single variable, prefetch_distance.  This saves
the effort of loading global variables repeatedly.  Then you can do

    __builtin_prefetch(p + prefetch_distance, 0, 0);

>              if (t) {
>                  return false;
>              }
> @@ -293,6 +298,30 @@ bool test_buffer_is_zero_next_accel(void)
>  }
>  #endif
>
> +#if defined(__aarch64__)
> +#include "qemu/aarch64-cpuid.h"
> +
> +static void __attribute__((constructor)) aarch64_init_cache_size(void)
> +{
> +    uint64_t t;
> +
> +    /* Use the DZP block size as a proxy for the cacheline size,
> +       since the latter is not available to userspace.  This seems
> +       to work in practice for existing implementations.  */
> +    asm("mrs %0, dczid_el0" : "=r"(t));
> +    if (pow(2, (t & 0xf)) * 4 >= 128) {
> +        cache_line_factor = 2;
> +    } else {
> +        cache_line_factor = 1;
> +    }
> +
> +    get_aarch64_cpu_id();
> +    if (is_thunderx_pass2_cpu()) {
> +        prefetch_line_dist = 3;
> +    }
> +}
> +#endif
> +
>  /*
>   * Checks if a buffer is all zeroes
>   */
> @@ -305,6 +334,12 @@ bool buffer_is_zero(const void *buf, size_t len)
>      /* Fetch the beginning of the buffer while we select the accelerator.  */
>      __builtin_prefetch(buf, 0, 0);
>
> +#if defined(__aarch64__)
> +    if (is_thunderx_pass2_cpu()) {
> +        __builtin_prefetch(buf + 16, 0, 0);
> +        __builtin_prefetch(buf + 32, 0, 0);

This should not be ThunderX or aarch64 specific; it should be a loop
like

    prefetch_distance_bytes = prefetch_line_dist * cache_line_size;
    for (i = 0; i < prefetch_distance_bytes; i += cache_line_size) {
        __builtin_prefetch(buf + i, 0, 0);
    }

In the default case, cache_line_size == prefetch_distance_bytes (both
are 64) and you will get the same behavior as the existing

    __builtin_prefetch(buf, 0, 0);

Thanks,

Paolo

> +    }
> +#endif
>      /* Use an optimized zero check if possible.  Note that this also
>         includes a check for an unrolled loop over 64-bit integers.  */
>      return select_accel_fn(buf, len);
>
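Putting the review suggestions together, a minimal sketch of the
revised approach could look like the following.  The names
cache_line_size, prefetch_line_dist, prefetch_distance and
prefetch_distance_bytes come from Paolo's comments; the constructor,
the default values and the simplified zero-check function (which
elides the unaligned head/tail handling and accelerator dispatch of
the real code) are assumptions for illustration only.

    #include <stdint.h>
    #include <stdbool.h>
    #include <stddef.h>

    static uint32_t cache_line_size = 64;    /* bytes, default */
    static uint32_t prefetch_line_dist = 1;  /* cache lines ahead */

    /* Precomputed once at startup, in uint64_t units, so the hot loop
       reads one global instead of multiplying two on every iteration.
       With the defaults this is 64 * 1 / 8 == 8, the same distance as
       the existing __builtin_prefetch(p + 8, 0, 0).  */
    static uint32_t prefetch_distance;

    static void __attribute__((constructor)) init_prefetch_distance(void)
    {
        prefetch_distance = cache_line_size * prefetch_line_dist
                            / sizeof(uint64_t);
    }

    static bool buffer_zero_int(const void *buf, size_t len)
    {
        /* Simplified: assumes buf is 8-byte aligned and len is a
           multiple of 8; the real function handles the unaligned
           head and tail separately.  */
        const uint64_t *p = buf;
        const uint64_t *e = p + len / 8;
        uint64_t t;

        for (; p + 8 <= e; p += 8) {
            __builtin_prefetch(p + prefetch_distance, 0, 0);
            t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
            if (t) {
                return false;
            }
        }
        for (; p < e; p++) {
            if (*p) {
                return false;
            }
        }
        return true;
    }

    bool buffer_is_zero(const void *buf, size_t len)
    {
        size_t i;
        size_t prefetch_distance_bytes = prefetch_line_dist * cache_line_size;

        /* Generic head prefetch, no CPU-specific checks: with the
           default values both sides are 64, so this issues a single
           prefetch, matching the existing
           __builtin_prefetch(buf, 0, 0).  */
        for (i = 0; i < prefetch_distance_bytes; i += cache_line_size) {
            __builtin_prefetch((const char *)buf + i, 0, 0);
        }

        return buffer_zero_int(buf, len);
    }

With the defaults (cache_line_size == 64, prefetch_line_dist == 1)
this behaves exactly like the current code on non-aarch64 hosts; only
the aarch64 constructor would change the two tunables, keeping the
hot path free of per-architecture #ifdefs.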