Make buffer_is_zero a 'static inline' function that tests up to three bytes from the buffer (the first, last and middle bytes) before handing off to an unrolled loop. This eliminates call overhead for most non-zero buffers, and allows the compiler to optimize out length checks when the length is known at compile time (which is often the case in QEMU).
Signed-off-by: Alexander Monakov <amona...@ispras.ru> Signed-off-by: Mikhail Romanov <mmroma...@ispras.ru> --- include/qemu/cutils.h | 28 +++++++++++++++- util/bufferiszero.c | 76 ++++++++++++------------------------------- 2 files changed, 47 insertions(+), 57 deletions(-) diff --git a/include/qemu/cutils.h b/include/qemu/cutils.h index 92c927a6a3..62b153e603 100644 --- a/include/qemu/cutils.h +++ b/include/qemu/cutils.h @@ -187,9 +187,35 @@ char *freq_to_str(uint64_t freq_hz); /* used to print char* safely */ #define STR_OR_NULL(str) ((str) ? (str) : "null") -bool buffer_is_zero(const void *buf, size_t len); +bool buffer_is_zero_len_4_plus(const void *, size_t); +extern bool (*buffer_is_zero_len_256_plus)(const void *, size_t); bool test_buffer_is_zero_next_accel(void); +/* + * Check if a buffer is all zeroes. + */ +static inline bool buffer_is_zero(const void *vbuf, size_t len) +{ + const char *buf = vbuf; + + if (len == 0) { + return true; + } + if (buf[0] || buf[len - 1] || buf[len / 2]) { + return false; + } + /* All bytes are covered for any len <= 3. */ + if (len <= 3) { + return true; + } + + if (len >= 256) { + return buffer_is_zero_len_256_plus(vbuf, len); + } else { + return buffer_is_zero_len_4_plus(vbuf, len); + } +} + /* * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128) * Input is limited to 14-bit numbers diff --git a/util/bufferiszero.c b/util/bufferiszero.c index f5a3634f9a..01050694a6 100644 --- a/util/bufferiszero.c +++ b/util/bufferiszero.c @@ -26,8 +26,8 @@ #include "qemu/bswap.h" #include "host/cpuinfo.h" -static bool -buffer_zero_int(const void *buf, size_t len) +bool +buffer_is_zero_len_4_plus(const void *buf, size_t len) { if (unlikely(len < 8)) { /* For a very small buffer, simply accumulate all the bytes. 
*/ @@ -157,57 +157,40 @@ buffer_zero_avx512(const void *buf, size_t len) } #endif /* CONFIG_AVX512F_OPT */ -/* - * Make sure that these variables are appropriately initialized when - * SSE2 is enabled on the compiler command-line, but the compiler is - * too old to support CONFIG_AVX2_OPT. - */ -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) -# define INIT_USED 0 -# define INIT_LENGTH 0 -# define INIT_ACCEL buffer_zero_int -#else -# ifndef __SSE2__ -# error "ISA selection confusion" -# endif -# define INIT_USED CPUINFO_SSE2 -# define INIT_LENGTH 64 -# define INIT_ACCEL buffer_zero_sse2 -#endif - -static unsigned used_accel = INIT_USED; -static unsigned length_to_accel = INIT_LENGTH; -static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL; - static unsigned __attribute__((noinline)) select_accel_cpuinfo(unsigned info) { /* Array is sorted in order of algorithm preference. */ static const struct { unsigned bit; - unsigned len; bool (*fn)(const void *, size_t); } all[] = { #ifdef CONFIG_AVX512F_OPT - { CPUINFO_AVX512F, 256, buffer_zero_avx512 }, + { CPUINFO_AVX512F, buffer_zero_avx512 }, #endif #ifdef CONFIG_AVX2_OPT - { CPUINFO_AVX2, 128, buffer_zero_avx2 }, + { CPUINFO_AVX2, buffer_zero_avx2 }, #endif - { CPUINFO_SSE2, 64, buffer_zero_sse2 }, - { CPUINFO_ALWAYS, 0, buffer_zero_int }, + { CPUINFO_SSE2, buffer_zero_sse2 }, + { CPUINFO_ALWAYS, buffer_is_zero_len_4_plus }, }; for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) { if (info & all[i].bit) { - length_to_accel = all[i].len; - buffer_accel = all[i].fn; + buffer_is_zero_len_256_plus = all[i].fn; return all[i].bit; } } return 0; } +static unsigned used_accel +#if defined(__SSE2__) + = CPUINFO_SSE2; +#else + = 0; +#endif + #if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) static void __attribute__((constructor)) init_accel(void) { @@ -227,35 +210,16 @@ bool test_buffer_is_zero_next_accel(void) return used; } -static bool select_accel_fn(const void *buf, size_t len) -{ - if (likely(len >= 
length_to_accel)) { - return buffer_accel(buf, len); - } - return buffer_zero_int(buf, len); -} - #else -#define select_accel_fn buffer_zero_int bool test_buffer_is_zero_next_accel(void) { return false; } #endif -/* - * Checks if a buffer is all zeroes - */ -bool buffer_is_zero(const void *buf, size_t len) -{ - if (unlikely(len == 0)) { - return true; - } - - /* Fetch the beginning of the buffer while we select the accelerator. */ - __builtin_prefetch(buf); - - /* Use an optimized zero check if possible. Note that this also - includes a check for an unrolled loop over 64-bit integers. */ - return select_accel_fn(buf, len); -} +bool (*buffer_is_zero_len_256_plus)(const void *, size_t) +#if defined(__SSE2__) + = buffer_zero_sse2; +#else + = buffer_is_zero_len_4_plus; +#endif -- 2.32.0