With the TCG_REG_RA base, we can form any 2G displacement for 64-bit, which means we only need 4 insns for the 64-bit target instead of 7.
Generate the mtctr and bctr insns once during translation and not as
part of ppc_tb_set_jmp_target.  This means we can update only 2 insns,
and we can arrange to do that update atomically.

Signed-off-by: Richard Henderson <r...@twiddle.net>
---
 tcg/ppc/tcg-target.c | 56 +++++++++++++++++++++++++++++++++++++++++-----------
 translate-all.c      |  2 ++
 2 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index c83fd9f..61a487d 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -1244,11 +1244,41 @@ static void tcg_out_brcond2 (TCGContext *s, const TCGArg *args,
 
 void ppc_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr)
 {
-    TCGContext s;
+    tcg_insn_unit insn1, insn2;
+    uint64_t pair;
+
+    if (in_range_b(addr - jmp_addr)) {
+        insn1 = B | ((addr - jmp_addr) & 0x3fffffc);
+        insn2 = NOP;
+    } else if (TCG_TARGET_REG_BITS == 32 || addr == (int32_t)addr) {
+        insn1 = ADDIS | TAI(TCG_REG_TMP1, 0, addr >> 16);
+        insn2 = ORI | TAI(TCG_REG_R0, TCG_REG_TMP1, addr);
+    } else {
+        intptr_t diff = addr - (uintptr_t)tb_ret_addr;
+        int16_t lo = diff;
+        int32_t hi = diff - lo;
+
+        assert(hi + lo == diff);
+        insn1 = ADDIS | TAI(TCG_REG_TMP1, TCG_REG_RA, hi >> 16);
+        insn2 = ADDI | TAI(TCG_REG_R0, TCG_REG_TMP1, lo);
+    }
+
+#ifdef HOST_WORDS_BIGENDIAN
+    pair = (uint64_t)insn1 << 32 | insn2;
+#else
+    pair = (uint64_t)insn2 << 32 | insn1;
+#endif
+
+    if (TCG_TARGET_REG_BITS == 64) {
+        *(uint64_t *)jmp_addr = pair;
+    } else {
+        /* In 32-bit mode, we've got to use the fpu to
+           atomically store 8 bytes.  */
+        /* ??? With gcc 4.8+, we could use __atomic_store.  */
+        __asm volatile("stfdx %0,0,%1" : : "f"(pair), "r"(jmp_addr));
+    }
 
-    s.code_buf = s.code_ptr = (tcg_insn_unit *)jmp_addr;
-    tcg_out_b(&s, 0, (tcg_insn_unit *)addr);
-    flush_icache_range(jmp_addr, jmp_addr + tcg_current_code_size(&s));
+    flush_icache_range(jmp_addr, jmp_addr + 8);
 }
 
 static void tcg_out_call(TCGContext *s, tcg_insn_unit *target)
@@ -1851,14 +1881,18 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
         tcg_out_b(s, 0, tb_ret_addr);
         break;
     case INDEX_op_goto_tb:
-        if (s->tb_jmp_offset) {
-            /* Direct jump method. */
-            s->tb_jmp_offset[args[0]] = tcg_current_code_size(s);
-            s->code_ptr += 7;
-        } else {
-            /* Indirect jump method. */
-            tcg_abort();
+        assert(s->tb_jmp_offset != NULL);
+        /* Direct jump method. */
+        /* Align the two insns so we can update them atomically. */
+        if ((intptr_t)s->code_ptr & 4) {
+            tcg_out32(s, NOP);
         }
+        s->tb_jmp_offset[args[0]] = tcg_current_code_size(s);
+        /* Max 2 insns to compose address into R0. */
+        s->code_ptr += 2;
+        /* Branch to the address computed. */
+        tcg_out32(s, MTSPR | RS(TCG_REG_R0) | CTR);
+        tcg_out32(s, BCCTR | BO_ALWAYS);
         s->tb_next_offset[args[0]] = tcg_current_code_size(s);
         break;
     case INDEX_op_br:
diff --git a/translate-all.c b/translate-all.c
index 5549a85..6bd8fe8 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -466,6 +466,8 @@ static inline PageDesc *page_find(tb_page_addr_t index)
    host cpu, as used by the TCG implementation of goto_tb.  */
 #if defined(__x86_64__)
 # define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
+#elif defined(_ARCH_PPC)
+# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
 #elif defined(__sparc__)
 # define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
 #elif defined(__aarch64__)
-- 
1.9.3