[Qemu-devel] [RFC PATCH v1 2/2] utils: Add prefetch for Thunderx platform

vijay . kilari Tue, 02 Aug 2016 03:24:31 -0700

From: Vijaya Kumar K <vijaya.ku...@cavium.com>

Thunderx pass2 chip requires explicit prefetch
instruction to give prefetch hint.


To speed up live migration on Thunderx platform,
prefetch instruction is added in zero buffer check
function.

The below results show live migration time improvement
with prefetch instruction with 1K and 4K page size.
VM with 4 VCPUs, 8GB RAM is migrated.

1K page size, no prefetch
=========================
Migration status: completed
total time: 13012 milliseconds
downtime: 10 milliseconds
setup: 15 milliseconds
transferred ram: 268131 kbytes
throughput: 168.84 mbps
remaining ram: 0 kbytes
total ram: 8519872 kbytes
duplicate: 8338072 pages
skipped: 0 pages
normal: 193335 pages
normal bytes: 193335 kbytes
dirty sync count: 4

1K page size with prefetch
=========================
Migration status: completed
total time: 7493 milliseconds
downtime: 71 milliseconds
setup: 16 milliseconds
transferred ram: 269666 kbytes
throughput: 294.88 mbps
remaining ram: 0 kbytes
total ram: 8519872 kbytes
duplicate: 8340596 pages
skipped: 0 pages
normal: 194837 pages
normal bytes: 194837 kbytes
dirty sync count: 3

4K page size with no prefetch
=============================
Migration status: completed
total time: 10456 milliseconds
downtime: 49 milliseconds
setup: 5 milliseconds
transferred ram: 231726 kbytes
throughput: 181.59 mbps
remaining ram: 0 kbytes
total ram: 8519872 kbytes
duplicate: 2079914 pages
skipped: 0 pages
normal: 53257 pages
normal bytes: 213028 kbytes
dirty sync count: 3

4K page size with prefetch
==========================
Migration status: completed
total time: 3937 milliseconds
downtime: 23 milliseconds
setup: 5 milliseconds
transferred ram: 229283 kbytes
throughput: 477.19 mbps
remaining ram: 0 kbytes
total ram: 8519872 kbytes
duplicate: 2079775 pages
skipped: 0 pages
normal: 52648 pages
normal bytes: 210592 kbytes
dirty sync count: 3

Signed-off-by: Vijaya Kumar K <vijaya.ku...@cavium.com>
---
 include/qemu-common.h |    1 +
 util/cpuinfo.c        |   38 ++++++++++++++++++++++++++++++++++++++
 util/cutils.c         |   22 ++++++++++++++++++++++
 3 files changed, 61 insertions(+)

diff --git a/include/qemu-common.h b/include/qemu-common.h
index 62ad674..3d8a32c 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -135,4 +135,5 @@ void page_size_init(void);
 bool dump_in_progress(void);
 
 long int qemu_read_cpuid_info(void);
+bool is_thunder_pass2_cpu(void);
 #endif
diff --git a/util/cpuinfo.c b/util/cpuinfo.c
index 3ba7194..0e72a34 100644
--- a/util/cpuinfo.c
+++ b/util/cpuinfo.c
@@ -16,6 +16,26 @@
 
 #if defined(__aarch64__)
 
+#define MIDR_IMPLEMENTER_SHIFT  24
+#define MIDR_IMPLEMENTER_MASK   (0xffULL << MIDR_IMPLEMENTER_SHIFT)
+#define MIDR_ARCHITECTURE_SHIFT 16
+#define MIDR_ARCHITECTURE_MASK  (0xf << MIDR_ARCHITECTURE_SHIFT)
+#define MIDR_PARTNUM_SHIFT      4
+#define MIDR_PARTNUM_MASK       (0xfff << MIDR_PARTNUM_SHIFT)
+
+#define MIDR_CPU_PART(imp, partnum) \
+        (((imp)                 << MIDR_IMPLEMENTER_SHIFT)  | \
+        (0xf                    << MIDR_ARCHITECTURE_SHIFT) | \
+        ((partnum)              << MIDR_PARTNUM_SHIFT))
+
+#define ARM_CPU_IMP_CAVIUM        0x43
+#define CAVIUM_CPU_PART_THUNDERX  0x0A1
+
+#define MIDR_THUNDERX  \
+               MIDR_CPU_PART(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
+#define CPU_MODEL_MASK  (MIDR_IMPLEMENTER_MASK | MIDR_ARCHITECTURE_MASK | \
+                         MIDR_PARTNUM_MASK)
+
 long int qemu_read_cpuid_info(void)
 {
     FILE *fp;
@@ -49,4 +69,22 @@ out:
 
     return midr;
 }
+
+bool is_thunder_pass2_cpu(void)
+{
+    static bool cpu_info_read;
+    static long int midr_thunder_val;
+
+    if (!cpu_info_read) {
+        midr_thunder_val = qemu_read_cpuid_info();
+        midr_thunder_val &= CPU_MODEL_MASK;
+        cpu_info_read = 1;
+    }
+
+    if (midr_thunder_val == MIDR_THUNDERX) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
 #endif
diff --git a/util/cutils.c b/util/cutils.c
index 7505fda..66c816b 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -191,6 +191,8 @@ int qemu_fdatasync(int fd)
         ((vgetq_lane_u64(v1, 0) == vgetq_lane_u64(v2, 0)) && \
          (vgetq_lane_u64(v1, 1) == vgetq_lane_u64(v2, 1)))
 #define VEC_OR(v1, v2) ((v1) | (v2))
+#define VEC_PREFETCH(base, index) \
+        asm volatile ("prfm pldl1strm, [%x[a]]\n" : : [a]"r"(&base[(index)]))
 #else
 #define VECTYPE        unsigned long
 #define SPLAT(p)       (*(p) * (~0UL / 255))
@@ -233,6 +235,9 @@ static size_t buffer_find_nonzero_offset_inner(const void 
*buf, size_t len)
     const VECTYPE *p = buf;
     const VECTYPE zero = (VECTYPE){0};
     size_t i;
+#if defined (__aarch64__)
+    bool do_prefetch;
+#endif
 
     assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
 
@@ -246,9 +251,26 @@ static size_t buffer_find_nonzero_offset_inner(const void 
*buf, size_t len)
         }
     }
 
+#if defined (__aarch64__)
+    do_prefetch = is_thunder_pass2_cpu();
+    if (do_prefetch) {
+        VEC_PREFETCH(p, 8);
+        VEC_PREFETCH(p, 16);
+        VEC_PREFETCH(p, 24);
+    }
+#endif
+
     for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
          i < len / sizeof(VECTYPE);
          i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+
+#if defined (__aarch64__)
+        if (do_prefetch) {
+            VEC_PREFETCH(p, i+32);
+            VEC_PREFETCH(p, i+40);
+        }
+#endif
+
         VECTYPE tmp0 = VEC_OR(p[i + 0], p[i + 1]);
         VECTYPE tmp1 = VEC_OR(p[i + 2], p[i + 3]);
         VECTYPE tmp2 = VEC_OR(p[i + 4], p[i + 5]);
-- 
1.7.9.5

[Qemu-devel] [RFC PATCH v1 2/2] utils: Add prefetch for Thunderx platform

Reply via email to