Make buffer_is_zero a 'static inline' function that tests up to three
bytes from the buffer before handing off to an unrolled loop. This
eliminates call overhead for most non-zero buffers, and allows to
optimize out length checks when it is known at compile time (which is
often the case in Qemu).

Signed-off-by: Alexander Monakov <amona...@ispras.ru>
Signed-off-by: Mikhail Romanov <mmroma...@ispras.ru>
---
 include/qemu/cutils.h | 28 +++++++++++++++-
 util/bufferiszero.c   | 76 ++++++++++++-------------------------------
 2 files changed, 47 insertions(+), 57 deletions(-)

diff --git a/include/qemu/cutils.h b/include/qemu/cutils.h
index 92c927a6a3..62b153e603 100644
--- a/include/qemu/cutils.h
+++ b/include/qemu/cutils.h
@@ -187,9 +187,35 @@ char *freq_to_str(uint64_t freq_hz);
 /* used to print char* safely */
 #define STR_OR_NULL(str) ((str) ? (str) : "null")
 
-bool buffer_is_zero(const void *buf, size_t len);
+bool buffer_is_zero_len_4_plus(const void *, size_t);
+extern bool (*buffer_is_zero_len_256_plus)(const void *, size_t);
 bool test_buffer_is_zero_next_accel(void);
 
+/*
+ * Check if a buffer is all zeroes.
+ */
+static inline bool buffer_is_zero(const void *vbuf, size_t len)
+{
+    const char *buf = vbuf;
+
+    if (len == 0) {
+        return true;
+    }
+    if (buf[0] || buf[len - 1] || buf[len / 2]) {
+        return false;
+    }
+    /* All bytes are covered for any len <= 3.  */
+    if (len <= 3) {
+        return true;
+    }
+
+    if (len >= 256) {
+        return buffer_is_zero_len_256_plus(vbuf, len);
+    } else {
+        return buffer_is_zero_len_4_plus(vbuf, len);
+    }
+}
+
 /*
  * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128)
  * Input is limited to 14-bit numbers
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index f5a3634f9a..01050694a6 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -26,8 +26,8 @@
 #include "qemu/bswap.h"
 #include "host/cpuinfo.h"
 
-static bool
-buffer_zero_int(const void *buf, size_t len)
+bool
+buffer_is_zero_len_4_plus(const void *buf, size_t len)
 {
     if (unlikely(len < 8)) {
         /* For a very small buffer, simply accumulate all the bytes.  */
@@ -157,57 +157,40 @@ buffer_zero_avx512(const void *buf, size_t len)
 }
 #endif /* CONFIG_AVX512F_OPT */
 
-/*
- * Make sure that these variables are appropriately initialized when
- * SSE2 is enabled on the compiler command-line, but the compiler is
- * too old to support CONFIG_AVX2_OPT.
- */
-#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
-# define INIT_USED     0
-# define INIT_LENGTH   0
-# define INIT_ACCEL    buffer_zero_int
-#else
-# ifndef __SSE2__
-#  error "ISA selection confusion"
-# endif
-# define INIT_USED     CPUINFO_SSE2
-# define INIT_LENGTH   64
-# define INIT_ACCEL    buffer_zero_sse2
-#endif
-
-static unsigned used_accel = INIT_USED;
-static unsigned length_to_accel = INIT_LENGTH;
-static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL;
-
 static unsigned __attribute__((noinline))
 select_accel_cpuinfo(unsigned info)
 {
     /* Array is sorted in order of algorithm preference. */
     static const struct {
         unsigned bit;
-        unsigned len;
         bool (*fn)(const void *, size_t);
     } all[] = {
 #ifdef CONFIG_AVX512F_OPT
-        { CPUINFO_AVX512F, 256, buffer_zero_avx512 },
+        { CPUINFO_AVX512F, buffer_zero_avx512 },
 #endif
 #ifdef CONFIG_AVX2_OPT
-        { CPUINFO_AVX2,    128, buffer_zero_avx2 },
+        { CPUINFO_AVX2,    buffer_zero_avx2 },
 #endif
-        { CPUINFO_SSE2,     64, buffer_zero_sse2 },
-        { CPUINFO_ALWAYS,    0, buffer_zero_int },
+        { CPUINFO_SSE2,    buffer_zero_sse2 },
+        { CPUINFO_ALWAYS,  buffer_is_zero_len_4_plus },
     };
 
     for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) {
         if (info & all[i].bit) {
-            length_to_accel = all[i].len;
-            buffer_accel = all[i].fn;
+            buffer_is_zero_len_256_plus = all[i].fn;
             return all[i].bit;
         }
     }
     return 0;
 }
 
+static unsigned used_accel
+#if defined(__SSE2__)
+    = CPUINFO_SSE2;
+#else
+    = 0;
+#endif
+
 #if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
 static void __attribute__((constructor)) init_accel(void)
 {
@@ -227,35 +210,16 @@ bool test_buffer_is_zero_next_accel(void)
     return used;
 }
 
-static bool select_accel_fn(const void *buf, size_t len)
-{
-    if (likely(len >= length_to_accel)) {
-        return buffer_accel(buf, len);
-    }
-    return buffer_zero_int(buf, len);
-}
-
 #else
-#define select_accel_fn  buffer_zero_int
 bool test_buffer_is_zero_next_accel(void)
 {
     return false;
 }
 #endif
 
-/*
- * Checks if a buffer is all zeroes
- */
-bool buffer_is_zero(const void *buf, size_t len)
-{
-    if (unlikely(len == 0)) {
-        return true;
-    }
-
-    /* Fetch the beginning of the buffer while we select the accelerator.  */
-    __builtin_prefetch(buf);
-
-    /* Use an optimized zero check if possible.  Note that this also
-       includes a check for an unrolled loop over 64-bit integers.  */
-    return select_accel_fn(buf, len);
-}
+bool (*buffer_is_zero_len_256_plus)(const void *, size_t)
+#if defined(__SSE2__)
+    = buffer_zero_sse2;
+#else
+    = buffer_is_zero_len_4_plus;
+#endif
-- 
2.32.0


Reply via email to