The code has no real knowledge of the cacheline size; it simply prefetches one loop iteration ahead.

Signed-off-by: Richard Henderson <r...@twiddle.net>
---
 util/cutils.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/util/cutils.c b/util/cutils.c
index 4d2edd6..0f1ce1d 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -173,6 +173,8 @@ NAME(const void *buf, size_t len) \
     do { \
         const VECTYPE *p = buf; \
         VECTYPE t; \
+        __builtin_prefetch(buf + SIZE); \
+        barrier(); \
         if (SIZE == sizeof(VECTYPE) * 4) { \
             t = (p[0] | p[1]) | (p[2] | p[3]); \
         } else if (SIZE == sizeof(VECTYPE) * 8) { \
@@ -376,6 +378,9 @@ bool buffer_is_zero(const void *buf, size_t len)
         return true;
     }
 
+    /* Fetch the beginning of the buffer while we select the accelerator. */
+    __builtin_prefetch(buf);
+
     /* Use an optimized zero check if possible. Note that this also
        includes a check for an unrolled loop over longs, as well as
        the unsized, unaligned fallback to buffer_zero_base. */
-- 
2.7.4