v2: use stq to replace all instructions atomically
tcg/ppc/tcg-target.c.inc | 105 +++++++++++++++++++++++++++------------
1 file changed, 74 insertions(+), 31 deletions(-)
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 1cbd047ab3..0cde11c3de 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -1847,44 +1847,87 @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
tcg_out32(s, insn);
}
-void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
- uintptr_t jmp_rw, uintptr_t addr)
+static inline uint64_t make_pair(tcg_insn_unit i1, tcg_insn_unit i2)
{
- if (TCG_TARGET_REG_BITS == 64) {
- tcg_insn_unit i1, i2;
- intptr_t tb_diff = addr - tc_ptr;
- intptr_t br_diff = addr - (jmp_rx + 4);
- uint64_t pair;
-
- /* This does not exercise the range of the branch, but we do
- still need to be able to load the new value of TCG_REG_TB.
- But this does still happen quite often. */
- if (tb_diff == (int16_t)tb_diff) {
- i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
- i2 = B | (br_diff & 0x3fffffc);
- } else {
- intptr_t lo = (int16_t)tb_diff;
- intptr_t hi = (int32_t)(tb_diff - lo);
- assert(tb_diff == hi + lo);
- i1 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
- i2 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
- }
-#if HOST_BIG_ENDIAN
- pair = (uint64_t)i1 << 32 | i2;
+ if (HOST_BIG_ENDIAN) {
+ return (uint64_t)i1 << 32 | i2;
+ }
+ return (uint64_t)i2 << 32 | i1;
+}
+
+static inline void ppc64_replace2(uintptr_t rx, uintptr_t rw,
+ tcg_insn_unit i0, tcg_insn_unit i1)
+{
+#if TCG_TARGET_REG_BITS == 64
+ qatomic_set((uint64_t *)rw, make_pair(i0, i1));
+ flush_idcache_range(rx, rw, 8);
#else
- pair = (uint64_t)i2 << 32 | i1;
+ qemu_build_not_reached();
#endif
+}
- /* As per the enclosing if, this is ppc64. Avoid the _Static_assert
- within qatomic_set that would fail to build a ppc32 host. */
- qatomic_set__nocheck((uint64_t *)jmp_rw, pair);
- flush_idcache_range(jmp_rx, jmp_rw, 8);
- } else {
+static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw,
+ tcg_insn_unit i0, tcg_insn_unit i1, tcg_insn_unit i2, tcg_insn_unit i3)
+{
+ uint64_t p[2];
+
+ p[!HOST_BIG_ENDIAN] = make_pair(i0, i1);
+ p[HOST_BIG_ENDIAN] = make_pair(i2, i3);
+
+ asm("mr %%r6, %1\n\t"
+ "mr %%r7, %2\n\t"
+ "stq %%r6, %0"
+ : "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7");
+ flush_idcache_range(rx, rw, 16);
+}
+
+void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
+ uintptr_t jmp_rw, uintptr_t addr)
+{
+ tcg_insn_unit i0, i1, i2, i3;
+ intptr_t tb_diff = addr - tc_ptr;
+ intptr_t br_diff = addr - (jmp_rx + 4);
+ intptr_t lo, hi;
+
+ if (TCG_TARGET_REG_BITS == 32) {
intptr_t diff = addr - jmp_rx;
tcg_debug_assert(in_range_b(diff));
qatomic_set((uint32_t *)jmp_rw, B | (diff & 0x3fffffc));
flush_idcache_range(jmp_rx, jmp_rw, 4);
+ return;
+ }
+
+ /*
+ * This does not exercise the range of the branch, but we do
+ * still need to be able to load the new value of TCG_REG_TB.
+ * But this does still happen quite often.
+ */
+ if (tb_diff == (int16_t)tb_diff) {
+ i0 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
+ i1 = B | (br_diff & 0x3fffffc);
+ ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
+ return;
+ }
+
+ lo = (int16_t)tb_diff;
+ hi = (int32_t)(tb_diff - lo);
+ assert(tb_diff == hi + lo);
+ i0 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
+ i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
+ if (!have_isa_2_07) {
+ ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
+ return;
+ }
+
+ br_diff -= 4;
+ if (in_range_b(br_diff)) {
+ i2 = B | (br_diff & 0x3fffffc);
+ i3 = NOP;
+ } else {
+ i2 = MTSPR | RS(TCG_REG_TB) | CTR;
+ i3 = BCCTR | BO_ALWAYS;
}
+ ppc64_replace4(jmp_rx, jmp_rw, i0, i1, i2, i3);
}
static void tcg_out_call_int(TCGContext *s, int lk,
@@ -2574,8 +2617,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
if (s->tb_jmp_insn_offset) {
/* Direct jump. */
if (TCG_TARGET_REG_BITS == 64) {
- /* Ensure the next insns are 8-byte aligned. */
- if ((uintptr_t)s->code_ptr & 7) {
+ /* Ensure the next insns are 16-byte aligned. */
+ while ((uintptr_t)s->code_ptr & 15) {
tcg_out32(s, NOP);
}
s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);