SC using cmpxchg helpers

Emilio G. Cota Mon, 27 Jun 2016 12:39:59 -0700

Emulating LL/SC with cmpxchg is not correct, since it can
suffer from the ABA problem. Portable parallel code, however,
is written assuming only cmpxchg--and not LL/SC--is available.
This means that in practice emulating LL/SC with cmpxchg is
a viable alternative.


The appended emulates LL/SC pairs in aarch64 with cmpxchg helpers.
This works in both user and system mode. In usermode, it avoids
pausing all other CPUs to perform the LL/SC pair. The subsequent
performance and scalability improvement is significant, as the
plots below show. They plot the throughput of atomic_add-bench
compiled for ARM and executed on a 64-core x86 machine.

Hi-res plots: http://imgur.com/a/JVc8Y

                atomic_add-bench: 1000000 ops/thread, [0,1] range

  18 ++---------+----------+---------+----------+----------+----------+---++
     +cmpxchg +-E--+       +         +          +          +          +    |
  16 ++master +-H--+                                                      ++
     ||                                                                    |
  14 ++                                                                   ++
     | |                                                                   |
  12 ++|                                                                  ++
     | |                                                                   |
  10 ++++                                                                 ++
   8 ++E                                                                  ++
     |+++                                                                  |
   6 ++ |                                                                 ++
     |  |                                                                  |
   4 ++ |                                                                 ++
     |   |                                                                 |
   2 +H++E+---                                                            ++
     + |     +E++----+E+---+--+E+----++E+------+E+------+E++----+E+---+--+E|
   0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

                atomic_add-bench: 1000000 ops/thread, [0,2] range

  18 ++---------+----------+---------+----------+----------+----------+---++
     +cmpxchg +-E--+       +         +          +          +          +    |
  16 ++master +-H--+                                                      ++
     | |                                                                   |
  14 ++E                                                                  ++
     | |                                                                   |
  12 ++|                                                                  ++
     |+++                                                                  |
  10 ++ |                                                                 ++
   8 ++ |                                                                 ++
     |  |                                                                  |
   6 ++ |                                                                 ++
     |   |                                                                 |
   4 ++  |                                                                ++
     |  +E+---                                                             |
   2 +H+     +E+-----+++              +++      +++   ---+E+-----+E+------+++
     +++        +    +E+---+--+E+----++E+------+E+---   ++++    +++   +  +E|
   0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

               atomic_add-bench: 1000000 ops/thread, [0,128] range

  70 ++---------+----------+---------+----------+----------+----------+---++
     +cmpxchg +-E--+       +         +          +          +          +    |
  60 ++master +-H--+                  +++            ---+E+-----+E+------+E+
     |                        +E+------E-------+E+---                      |
     |                     ---        +++                                  |
  50 ++              +++---                                               ++
     |              -+E+                                                   |
  40 ++      +++----                                                      ++
     |        E-                                                           |
     |      --|                                                            |
  30 ++   -- +++                                                          ++
     |  +E+                                                                |
  20 ++E+                                                                 ++
     |E+                                                                   |
     |                                                                     |
  10 ++                                                                   ++
     +          +          +         +          +          +          +    |
   0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

              atomic_add-bench: 1000000 ops/thread, [0,1024] range

  160 ++---------+---------+----------+---------+----------+----------+---++
      +cmpxchg +-E--+      +          +         +          +          +    |
  140 ++master +-H--+                                           +++      +++
      |                                                -+E+-----+E+-------E|
  120 ++                                       +++ ----                  +++
      |                                +++  ----E--                        |
  100 ++                              --E---   +++                        ++
      |                       +++ ---- +++                                 |
   80 ++                     --E--                                        ++
      |                  ---- +++                                          |
      |              -+E+                                                  |
   60 ++         ---- +++                                                 ++
      |      +E+-                                                          |
   40 ++   --                                                             ++
      |  +E+                                                               |
   20 +EE+                                                                ++
      +++        +         +          +         +          +          +    |
    0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++
      0          10        20         30        40         50         60
                                Number of threads

Signed-off-by: Emilio G. Cota <c...@braap.org>
---
 target-arm/translate-a64.c | 79 ++++++++++++++++++----------------------------
 1 file changed, 30 insertions(+), 49 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index f5e29d2..1676519 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -1754,17 +1754,6 @@ static void disas_b_exc_sys(DisasContext *s, uint32_t 
insn)
     }
 }
 
-/*
- * Load/Store exclusive instructions are implemented by remembering
- * the value/address loaded, and seeing if these are the same
- * when the store is performed. This is not actually the architecturally
- * mandated semantics, but it works for typical guest code sequences
- * and avoids having to monitor regular stores.
- *
- * In system emulation mode only one CPU will be running at once, so
- * this sequence is effectively atomic.  In user emulation mode we
- * throw an exception and handle the atomic operation elsewhere.
- */
 static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
                                TCGv_i64 addr, int size, bool is_pair)
 {
@@ -1794,16 +1783,6 @@ static void gen_load_exclusive(DisasContext *s, int rt, 
int rt2,
     tcg_gen_mov_i64(cpu_exclusive_addr, addr);
 }
 
-#ifdef CONFIG_USER_ONLY
-static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
-                                TCGv_i64 addr, int size, int is_pair)
-{
-    tcg_gen_mov_i64(cpu_exclusive_test, addr);
-    tcg_gen_movi_i32(cpu_exclusive_info,
-                     size | is_pair << 2 | (rd << 4) | (rt << 9) | (rt2 << 
14));
-    gen_exception_internal_insn(s, 4, EXCP_STREX);
-}
-#else
 static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
                                 TCGv_i64 inaddr, int size, int is_pair)
 {
@@ -1831,46 +1810,48 @@ static void gen_store_exclusive(DisasContext *s, int 
rd, int rt, int rt2,
     tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_exclusive_addr, fail_label);
 
     tmp = tcg_temp_new_i64();
-    tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), s->be_data + size);
-    tcg_gen_brcond_i64(TCG_COND_NE, tmp, cpu_exclusive_val, fail_label);
-    tcg_temp_free_i64(tmp);
-
     if (is_pair) {
-        TCGv_i64 addrhi = tcg_temp_new_i64();
-        TCGv_i64 tmphi = tcg_temp_new_i64();
-
-        tcg_gen_addi_i64(addrhi, addr, 1 << size);
-        tcg_gen_qemu_ld_i64(tmphi, addrhi, get_mem_index(s),
-                            s->be_data + size);
-        tcg_gen_brcond_i64(TCG_COND_NE, tmphi, cpu_exclusive_high, fail_label);
-
-        tcg_temp_free_i64(tmphi);
-        tcg_temp_free_i64(addrhi);
-    }
-
-    /* We seem to still have the exclusive monitor, so do the store */
-    tcg_gen_qemu_st_i64(cpu_reg(s, rt), addr, get_mem_index(s),
-                        s->be_data + size);
-    if (is_pair) {
-        TCGv_i64 addrhi = tcg_temp_new_i64();
+        if (size == 2) {
+            gen_helper_paired_cmpxchg64l(tmp, cpu_env, addr, cpu_exclusive_val,
+                                         cpu_exclusive_high,
+                                         cpu_reg(s, rt), cpu_reg(s, rt2));
+        } else {
+            gen_helper_paired_cmpxchg64q(tmp, cpu_env, addr, cpu_exclusive_val,
+                                         cpu_exclusive_high, cpu_reg(s, rt),
+                                         cpu_reg(s, rt2));
+        }
+    } else {
+        TCGv_i64 val = cpu_reg(s, rt);
 
-        tcg_gen_addi_i64(addrhi, addr, 1 << size);
-        tcg_gen_qemu_st_i64(cpu_reg(s, rt2), addrhi,
-                            get_mem_index(s), s->be_data + size);
-        tcg_temp_free_i64(addrhi);
+        switch (size) {
+        case 0:
+            gen_helper_cmpxchg64b(tmp, cpu_env, addr, cpu_exclusive_val, val);
+            break;
+        case 1:
+            gen_helper_cmpxchg64w(tmp, cpu_env, addr, cpu_exclusive_val, val);
+            break;
+        case 2:
+            gen_helper_cmpxchg64l(tmp, cpu_env, addr, cpu_exclusive_val, val);
+            break;
+        case 3:
+            gen_helper_cmpxchg64q(tmp, cpu_env, addr, cpu_exclusive_val, val);
+            break;
+        default:
+            abort();
+        }
     }
 
     tcg_temp_free_i64(addr);
 
-    tcg_gen_movi_i64(cpu_reg(s, rd), 0);
+    tcg_gen_mov_i64(cpu_reg(s, rd), tmp);
+    tcg_temp_free_i64(tmp);
     tcg_gen_br(done_label);
+
     gen_set_label(fail_label);
     tcg_gen_movi_i64(cpu_reg(s, rd), 1);
     gen_set_label(done_label);
     tcg_gen_movi_i64(cpu_exclusive_addr, -1);
-
 }
-#endif
 
 /* Update the Sixty-Four bit (SF) registersize. This logic is derived
  * from the ARMv8 specs for LDR (Shared decode for all encodings).
-- 
2.5.0

[Qemu-devel] [RFC 27/30] target-arm: emulate aarch64's LL/SC using cmpxchg helpers

Reply via email to