Using compact branches, when possible, avoids a delay slot nop. Signed-off-by: Richard Henderson <r...@twiddle.net> --- include/elf.h | 4 + tcg/mips/tcg-target.c | 270 +++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 218 insertions(+), 56 deletions(-)
diff --git a/include/elf.h b/include/elf.h index 1098d21..6e52ba0 100644 --- a/include/elf.h +++ b/include/elf.h @@ -352,6 +352,10 @@ typedef struct { #define R_MIPS_CALLHI16 30 #define R_MIPS_CALLLO16 31 /* + * Incomplete list of MIPS R6 relocation types. + */ +#define R_MIPS_PC26_S2 61 +/* * This range is reserved for vendor specific relocations. */ #define R_MIPS_LOVENDOR 100 diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c index ab72715..447dbcf 100644 --- a/tcg/mips/tcg-target.c +++ b/tcg/mips/tcg-target.c @@ -154,6 +154,19 @@ static inline void reloc_pc16(tcg_insn_unit *pc, tcg_insn_unit *target) *pc = deposit32(*pc, 0, 16, reloc_pc16_val(pc, target)); } +static inline uint32_t reloc_pc26_val(tcg_insn_unit *pc, tcg_insn_unit *target) +{ + /* Let the compiler perform the right-shift as part of the arithmetic. */ + ptrdiff_t disp = target - (pc + 1); + tcg_debug_assert(disp == sextract32(disp, 0, 26)); + return disp & 0x3ffffff; +} + +static inline void reloc_pc26(tcg_insn_unit *pc, tcg_insn_unit *target) +{ + *pc = deposit32(*pc, 0, 26, reloc_pc26_val(pc, target)); +} + static inline uint32_t reloc_26_val(tcg_insn_unit *pc, tcg_insn_unit *target) { tcg_debug_assert((((uintptr_t)pc ^ (uintptr_t)target) & 0xf0000000) == 0); @@ -168,9 +181,17 @@ static inline void reloc_26(tcg_insn_unit *pc, tcg_insn_unit *target) static void patch_reloc(tcg_insn_unit *code_ptr, int type, intptr_t value, intptr_t addend) { - tcg_debug_assert(type == R_MIPS_PC16); tcg_debug_assert(addend == 0); - reloc_pc16(code_ptr, (tcg_insn_unit *)value); + switch (type) { + case R_MIPS_PC16: + reloc_pc16(code_ptr, (tcg_insn_unit *)value); + break; + case R_MIPS_PC26_S2: + reloc_pc26(code_ptr, (tcg_insn_unit *)value); + break; + default: + tcg_abort(); + } } #define TCG_CT_CONST_ZERO 0x100 @@ -288,7 +309,10 @@ typedef enum { OPC_BEQ = 004 << 26, OPC_BNE = 005 << 26, OPC_BLEZ = 006 << 26, + OPC_BGEUC = OPC_BLEZ, /* R6: rs != 0, rt != 0, rs != rt */ OPC_BGTZ = 007 << 26, + OPC_BLTUC = 
OPC_BGTZ, /* R6: rs != 0, rt != 0, rs != rt */ + OPC_BEQC = 010 << 26, /* R6: rs > rt */ OPC_ADDIU = 011 << 26, OPC_SLTI = 012 << 26, OPC_SLTIU = 013 << 26, @@ -297,6 +321,9 @@ typedef enum { OPC_XORI = 016 << 26, OPC_LUI = 017 << 26, OPC_AUI = OPC_LUI, + OPC_BGEC = 026 << 26, + OPC_BLTC = 027 << 26, + OPC_BNEC = 030 << 26, /* R6: rs > rt */ OPC_DADDIU = 031 << 26, OPC_DAUI = 035 << 26, OPC_LB = 040 << 26, @@ -308,6 +335,7 @@ typedef enum { OPC_SB = 050 << 26, OPC_SH = 051 << 26, OPC_SW = 053 << 26, + OPC_BC = 062 << 26, OPC_LD = 067 << 26, OPC_SD = 077 << 26, @@ -421,6 +449,33 @@ typedef enum { ? OPC_SRL : OPC_DSRL, } MIPSInsn; +static inline void tcg_out_nop(TCGContext *s) +{ + tcg_out32(s, 0); +} + +/* + * Protect against a mips r6 forbidden slot. If the previously + * emitted insn has a forbidden slot, emit a nop. + */ + +static void nop_forbidden_slot(TCGContext *s) +{ + if (use_mips32r6_instructions && s->code_ptr > s->code_buf) { + tcg_insn_unit prev = s->code_ptr[-1]; + switch (prev & 0xfc000000u) { + case OPC_BGEC: + case OPC_BLTC: + case OPC_BGEUC: + case OPC_BLTUC: + case OPC_BEQC: + case OPC_BNEC: + tcg_out_nop(s); + break; + } + } +} + /* * Type reg */ @@ -496,17 +551,30 @@ static inline void tcg_out_opc_bf64(TCGContext *s, MIPSInsn opc, MIPSInsn opm, /* * Type branch */ -static inline void tcg_out_opc_br(TCGContext *s, MIPSInsn opc, - TCGReg rt, TCGReg rs) +static void tcg_out_opc_br(TCGContext *s, MIPSInsn opc, TCGReg rt, TCGReg rs) { + uint16_t offset; + + nop_forbidden_slot(s); + /* We pay attention here to not modify the branch target by reading the existing value and using it again. This ensure that caches and memory are kept coherent during retranslation. 
*/ - uint16_t offset = (uint16_t)*s->code_ptr; - + offset = (uint16_t)*s->code_ptr; tcg_out_opc_imm(s, opc, rt, rs, offset); } +static void tcg_out_opc_br_pc16(TCGContext *s, MIPSInsn opc, + TCGReg rt, TCGReg rs, TCGLabel *l) +{ + tcg_out_opc_br(s, opc, rt, rs); + if (l->has_value) { + reloc_pc16(s->code_ptr - 1, l->u.value_ptr); + } else { + tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC16, l, 0); + } +} + /* * Type sa */ @@ -552,17 +620,16 @@ static bool tcg_out_opc_jmp(TCGContext *s, MIPSInsn opc, void *target) } tcg_debug_assert((dest & 3) == 0); + /* Since the entire code gen buffer is within the 256MB region, + potentially adding a nop doesn't affect the test above. */ + nop_forbidden_slot(s); + inst = opc; inst |= (dest >> 2) & 0x3ffffff; tcg_out32(s, inst); return true; } -static inline void tcg_out_nop(TCGContext *s) -{ - tcg_out32(s, 0); -} - static inline void tcg_out_dsll(TCGContext *s, TCGReg rd, TCGReg rt, TCGArg sa) { tcg_out_opc_sa64(s, OPC_DSLL, OPC_DSLL32, rd, rt, sa); @@ -1018,59 +1085,136 @@ static void tcg_out_brcond(TCGContext *s, TCGCond cond, TCGReg arg1, [TCG_COND_GE] = OPC_BGEZ, }; - MIPSInsn s_opc = OPC_SLTU; - MIPSInsn b_opc; + MIPSInsn b_opc = 0; + bool compact = false; + TCGReg rs = arg1, rt = arg2; int cmp_map; - switch (cond) { - case TCG_COND_EQ: - b_opc = OPC_BEQ; - break; - case TCG_COND_NE: - b_opc = OPC_BNE; - break; + /* We shouldn't expect to have arg1 == arg2, as the TCG optimizer + should have eliminated all such. However, the R6 encodings do + not allow this situation, so e.g. if the optimizer is disabled + we must fall back to normal compares. */ + if (use_mips32r6_instructions && arg1 != arg2) { + switch (cond) { + case TCG_COND_EQ: + case TCG_COND_NE: + if (rs < rt) { + rs = arg2, rt = arg1; + } + b_opc = cond == TCG_COND_EQ ? 
OPC_BEQC : OPC_BNEC; + compact = true; + break; - case TCG_COND_LT: - case TCG_COND_GT: - case TCG_COND_LE: - case TCG_COND_GE: - if (arg2 == 0) { - b_opc = b_zero[cond]; - arg2 = arg1; - arg1 = 0; + case TCG_COND_LE: + case TCG_COND_GT: + /* Swap arguments to turn LE to GE or GT to LT. + This also produces BLEZC/BGTZC when arg2 = 0. */ + rs = arg2, rt = arg1; + if (rt == TCG_REG_ZERO) { + break; + } + b_opc = cond == TCG_COND_LE ? OPC_BGEC : OPC_BLTC; + compact = true; break; - } - s_opc = OPC_SLT; - /* FALLTHRU */ - case TCG_COND_LTU: - case TCG_COND_GTU: - case TCG_COND_LEU: - case TCG_COND_GEU: - cmp_map = mips_cmp_map[cond]; - if (cmp_map & MIPS_CMP_SWAP) { - TCGReg t = arg1; - arg1 = arg2; - arg2 = t; + case TCG_COND_GE: + case TCG_COND_LT: + if (rs == TCG_REG_ZERO) { + break; + } + /* The encoding of BGEZC/BLTZC requires rs = rt. */ + if (rt == TCG_REG_ZERO) { + rt = rs; + } + b_opc = cond == TCG_COND_GE ? OPC_BGEC : OPC_BLTC; + compact = true; + break; + + case TCG_COND_LEU: + /* Swap arguments to turn LE to GE. */ + rs = arg2, rt = arg1; + /* FALLTHRU */ + case TCG_COND_GEU: + if (rs == TCG_REG_ZERO || rt == TCG_REG_ZERO) { + break; + } + b_opc = OPC_BGEUC; + compact = true; + break; + + case TCG_COND_GTU: + /* Swap arguments to turn GT to LT. */ + rs = arg2, rt = arg1; + /* FALLTHRU */ + case TCG_COND_LTU: + if (rs == TCG_REG_ZERO || rt == TCG_REG_ZERO) { + break; + } + b_opc = OPC_BLTUC; + compact = true; + break; + + default: + tcg_abort(); + break; } - tcg_out_opc_reg(s, s_opc, TCG_TMP0, arg1, arg2); - b_opc = (cmp_map & MIPS_CMP_INV ? 
OPC_BEQ : OPC_BNE); - arg1 = TCG_TMP0; - arg2 = TCG_REG_ZERO; - break; + } - default: - tcg_abort(); - break; + if (b_opc == 0) { + MIPSInsn s_opc = OPC_SLTU; + + switch (cond) { + case TCG_COND_EQ: + b_opc = OPC_BEQ; + break; + case TCG_COND_NE: + b_opc = OPC_BNE; + break; + + case TCG_COND_LT: + case TCG_COND_GT: + case TCG_COND_LE: + case TCG_COND_GE: + if (arg2 == 0) { + b_opc = b_zero[cond]; + rs = arg1; + rt = 0; + break; + } + s_opc = OPC_SLT; + /* FALLTHRU */ + + case TCG_COND_LTU: + case TCG_COND_GTU: + case TCG_COND_LEU: + case TCG_COND_GEU: + cmp_map = mips_cmp_map[cond]; + if (cmp_map & MIPS_CMP_SWAP) { + TCGReg t = arg1; + arg1 = arg2; + arg2 = t; + } + tcg_out_opc_reg(s, s_opc, TCG_TMP0, arg1, arg2); + if (use_mips32r6_instructions) { + b_opc = (cmp_map & MIPS_CMP_INV ? OPC_BEQC : OPC_BNEC); + compact = true; + } else { + b_opc = (cmp_map & MIPS_CMP_INV ? OPC_BEQ : OPC_BNE); + } + rs = TCG_TMP0; + rt = TCG_REG_ZERO; + break; + + default: + tcg_abort(); + break; + } } - tcg_out_opc_br(s, b_opc, arg1, arg2); - if (l->has_value) { - reloc_pc16(s->code_ptr - 1, l->u.value_ptr); - } else { - tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC16, l, 0); + tcg_out_opc_br_pc16(s, b_opc, rt, rs, l); + if (!compact) { + tcg_out_nop(s); } - tcg_out_nop(s); } static TCGReg tcg_out_reduce_eq2(TCGContext *s, TCGReg tmp0, TCGReg tmp1, @@ -1807,6 +1951,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, case INDEX_op_goto_tb: if (s->tb_jmp_offset) { /* direct jump method */ + nop_forbidden_slot(s); s->tb_jmp_offset[a0] = tcg_current_code_size(s); /* Avoid clobbering the address during retranslation. 
*/ tcg_out32(s, OPC_J | (*(uint32_t *)s->code_ptr & 0x3ffffff)); @@ -1820,8 +1965,21 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, s->tb_next_offset[a0] = tcg_current_code_size(s); break; case INDEX_op_br: - tcg_out_brcond(s, TCG_COND_EQ, TCG_REG_ZERO, TCG_REG_ZERO, - arg_label(a0)); + { + TCGLabel *l = arg_label(a0); + if (use_mips32r6_instructions) { + nop_forbidden_slot(s); + tcg_out32(s, OPC_BC); + if (l->has_value) { + reloc_pc26(s->code_ptr - 1, l->u.value_ptr); + } else { + tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC26_S2, l, 0); + } + } else { + tcg_out_opc_br_pc16(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO, l); + tcg_out_nop(s); + } + } break; case INDEX_op_ld8u_i32: -- 2.5.0