The use of prefetching in bufferiszero.c is questionable for several reasons: - prefetches are issued only a few CPU cycles before the corresponding cache line would be hit by demand loads;
- they are done for simple access patterns, i.e. where hardware prefetchers can perform better; - they compete for load ports in loops that should be limited by load port throughput rather than ALU throughput. Signed-off-by: Alexander Monakov <amona...@ispras.ru> Signed-off-by: Mikhail Romanov <mmroma...@ispras.ru> --- util/bufferiszero.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/util/bufferiszero.c b/util/bufferiszero.c index c037d11d04..cb3eb2543f 100644 --- a/util/bufferiszero.c +++ b/util/bufferiszero.c @@ -49,7 +49,6 @@ buffer_is_zero_len_4_plus(const void *buf, size_t len) const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8); for (; p + 8 <= e; p += 8) { - __builtin_prefetch(p + 8); if (t) { return false; } @@ -79,7 +78,6 @@ buffer_zero_sse2(const void *buf, size_t len) /* Loop over 16-byte aligned blocks of 64. */ while (likely(p <= e)) { - __builtin_prefetch(p); t = _mm_cmpeq_epi8(t, zero); if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) { return false; @@ -110,7 +108,6 @@ buffer_zero_avx2(const void *buf, size_t len) /* Loop over 32-byte aligned blocks of 128. */ while (p <= e) { - __builtin_prefetch(p); if (unlikely(!_mm256_testz_si256(t, t))) { return false; } -- 2.32.0