Liveness needs to track carry-live state in order to determine if the (hidden) output of the opcode is used. Code generation needs to track carry-live state in order to avoid clobbering cpu flags when loading constants.
So far, output routines and backends are unchanged. Signed-off-by: Richard Henderson <richard.hender...@linaro.org> --- include/tcg/tcg-opc.h | 10 ++++ include/tcg/tcg.h | 13 ++++- tcg/optimize.c | 11 ++++ tcg/tcg.c | 123 ++++++++++++++++++++++++++++++++++++++--- docs/devel/tcg-ops.rst | 61 ++++++++++++++++++++ 5 files changed, 208 insertions(+), 10 deletions(-) diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h index 1f995c54be..9cc20cd62c 100644 --- a/include/tcg/tcg-opc.h +++ b/include/tcg/tcg-opc.h @@ -82,6 +82,16 @@ DEF(shr, 1, 2, 0, TCG_OPF_INT) DEF(sub, 1, 2, 0, TCG_OPF_INT) DEF(xor, 1, 2, 0, TCG_OPF_INT) +DEF(addco, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT) +DEF(addc1o, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT) +DEF(addci, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN) +DEF(addcio, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN | TCG_OPF_CARRY_OUT) + +DEF(subbo, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT) +DEF(subb1o, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT) +DEF(subbi, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN) +DEF(subbio, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN | TCG_OPF_CARRY_OUT) + /* load/store */ DEF(ld8u_i32, 1, 1, 1, 0) DEF(ld8s_i32, 1, 1, 1, 0) diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h index c6b50b5226..aa300a2f8b 100644 --- a/include/tcg/tcg.h +++ b/include/tcg/tcg.h @@ -418,6 +418,11 @@ struct TCGContext { MemOp riscv_cur_vsew; TCGType riscv_cur_type; #endif + /* + * During the tcg_reg_alloc_op loop, we are within a sequence of + * carry-using opcodes like addco+addci. + */ + bool carry_live; GHashTable *const_table[TCG_TYPE_COUNT]; TCGTempSet free_temps[TCG_TYPE_COUNT]; @@ -749,13 +754,17 @@ enum { /* Instruction operands are vectors. */ TCG_OPF_VECTOR = 0x40, /* Instruction is a conditional branch. */ - TCG_OPF_COND_BRANCH = 0x80 + TCG_OPF_COND_BRANCH = 0x80, + /* Instruction produces carry out. */ + TCG_OPF_CARRY_OUT = 0x100, + /* Instruction consumes carry in. */ + TCG_OPF_CARRY_IN = 0x200, }; typedef struct TCGOpDef { const char *name; uint8_t nb_oargs, nb_iargs, nb_cargs, nb_args; - uint8_t flags; + uint16_t flags; } TCGOpDef; extern const TCGOpDef tcg_op_defs[]; diff --git a/tcg/optimize.c b/tcg/optimize.c index 9595b32d54..5a21f8bfd9 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -1214,6 +1214,12 @@ static bool fold_add_vec(OptContext *ctx, TCGOp *op) return finish_folding(ctx, op); } +static bool fold_add_carry(OptContext *ctx, TCGOp *op) +{ + fold_commutative(ctx, op); + return finish_folding(ctx, op); +} + static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add) { bool a_const = arg_is_const(op->args[2]) && arg_is_const(op->args[3]); @@ -2817,6 +2823,11 @@ void tcg_optimize(TCGContext *s) case INDEX_op_add_vec: done = fold_add_vec(&ctx, op); break; + case INDEX_op_addci: + case INDEX_op_addco: + case INDEX_op_addcio: + done = fold_add_carry(&ctx, op); + break; CASE_OP_32_64(add2): done = fold_add2(&ctx, op); break; diff --git a/tcg/tcg.c b/tcg/tcg.c index 019bf2d48f..88e4c6c7c8 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -3914,6 +3914,17 @@ liveness_pass_0(TCGContext *s) } } +static void assert_carry_dead(TCGContext *s) +{ + /* + * Carry operations can be separated by a few insns like mov, + * load or store, but they should always be "close", and + * carry-out operations should always be paired with carry-in. + * At various boundaries, carry must have been consumed. + */ + tcg_debug_assert(!s->carry_live); +} + /* Liveness analysis : update the opc_arg_life array to tell if a given input arguments is dead. Instructions updating dead temporaries are removed. */ @@ -3933,17 +3944,19 @@ liveness_pass_1(TCGContext *s) /* ??? Should be redundant with the exit_tb that ends the TB. */ la_func_end(s, nb_globals, nb_temps); + s->carry_live = false; QTAILQ_FOREACH_REVERSE_SAFE(op, &s->ops, link, op_prev) { int nb_iargs, nb_oargs; TCGOpcode opc_new, opc_new2; TCGLifeData arg_life = 0; TCGTemp *ts; TCGOpcode opc = op->opc; - const TCGOpDef *def = &tcg_op_defs[opc]; + const TCGOpDef *def; const TCGArgConstraint *args_ct; switch (opc) { case INDEX_op_call: + assert_carry_dead(s); { const TCGHelperInfo *info = tcg_call_info(op); int call_flags = tcg_call_flags(op); @@ -4055,6 +4068,7 @@ liveness_pass_1(TCGContext *s) } break; case INDEX_op_insn_start: + assert_carry_dead(s); break; case INDEX_op_discard: /* mark the temporary as dead */ @@ -4071,6 +4085,7 @@ liveness_pass_1(TCGContext *s) case INDEX_op_sub2_i64: opc_new = INDEX_op_sub; do_addsub2: + assert_carry_dead(s); /* Test if the high part of the operation is dead, but not the low part. The result can be optimized to a simple add or sub. This happens often for x86_64 guest when the @@ -4096,6 +4111,7 @@ liveness_pass_1(TCGContext *s) opc_new = INDEX_op_mul; opc_new2 = INDEX_op_muluh; do_mul2: + assert_carry_dead(s); if (arg_temp(op->args[1])->state == TS_DEAD) { if (arg_temp(op->args[0])->state == TS_DEAD) { /* Both parts of the operation are dead. */ @@ -4118,10 +4134,62 @@ liveness_pass_1(TCGContext *s) /* Mark the single-word operation live. */ goto do_not_remove; + case INDEX_op_addco: + if (s->carry_live) { + goto do_not_remove; + } + op->opc = opc = INDEX_op_add; + goto do_default; + + case INDEX_op_addcio: + if (s->carry_live) { + goto do_not_remove; + } + op->opc = opc = INDEX_op_addci; + goto do_default; + + case INDEX_op_subbo: + if (s->carry_live) { + goto do_not_remove; + } + /* Lower to sub, but this may also require canonicalization. */ + op->opc = opc = INDEX_op_sub; + ts = arg_temp(op->args[2]); + if (ts->kind == TEMP_CONST) { + ts = tcg_constant_internal(ts->type, -ts->val); + if (ts->state_ptr == NULL) { + tcg_debug_assert(temp_idx(ts) == nb_temps); + nb_temps++; + ts->state_ptr = tcg_malloc(sizeof(TCGRegSet)); + ts->state = TS_DEAD; + la_reset_pref(ts); + } + op->args[2] = temp_arg(ts); + op->opc = opc = INDEX_op_add; + } + goto do_default; + + case INDEX_op_subbio: + if (s->carry_live) { + goto do_not_remove; + } + op->opc = opc = INDEX_op_subbi; + goto do_default; + + case INDEX_op_addc1o: + case INDEX_op_subb1o: + if (s->carry_live) { + goto do_not_remove; + } + goto do_default; + default: - /* Test if the operation can be removed because all - its outputs are dead. We assume that nb_oargs == 0 - implies side effects */ + do_default: + /* + * Test if the operation can be removed because all + * its outputs are dead. We assume that nb_oargs == 0 + * implies side effects. + */ def = &tcg_op_defs[opc]; if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && def->nb_oargs != 0) { for (int i = def->nb_oargs - 1; i >= 0; i--) { @@ -4163,12 +4231,16 @@ liveness_pass_1(TCGContext *s) /* If end of basic block, update. */ if (def->flags & TCG_OPF_BB_EXIT) { + assert_carry_dead(s); la_func_end(s, nb_globals, nb_temps); } else if (def->flags & TCG_OPF_COND_BRANCH) { + assert_carry_dead(s); la_bb_sync(s, nb_globals, nb_temps); } else if (def->flags & TCG_OPF_BB_END) { + assert_carry_dead(s); la_bb_end(s, nb_globals, nb_temps); } else if (def->flags & TCG_OPF_SIDE_EFFECTS) { + assert_carry_dead(s); la_global_sync(s, nb_globals); if (def->flags & TCG_OPF_CALL_CLOBBER) { la_cross_call(s, nb_temps); @@ -4182,6 +4254,9 @@ liveness_pass_1(TCGContext *s) arg_life |= DEAD_ARG << i; } } + if (def->flags & TCG_OPF_CARRY_OUT) { + s->carry_live = false; + } /* Input arguments are live for preceding opcodes. */ for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) { @@ -4193,6 +4268,9 @@ liveness_pass_1(TCGContext *s) ts->state &= ~TS_DEAD; } } + if (def->flags & TCG_OPF_CARRY_IN) { + s->carry_live = true; + } /* Incorporate constraints for this operand. */ switch (opc) { @@ -4232,6 +4310,7 @@ liveness_pass_1(TCGContext *s) } op->life = arg_life; } + assert_carry_dead(s); } /* Liveness analysis: Convert indirect regs to direct temporaries. */ @@ -4814,9 +4893,8 @@ static void sync_globals(TCGContext *s, TCGRegSet allocated_regs) all globals are stored at their canonical location. */ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs) { - int i; - - for (i = s->nb_globals; i < s->nb_temps; i++) { + assert_carry_dead(s); + for (int i = s->nb_globals; i < s->nb_temps; i++) { TCGTemp *ts = &s->temps[i]; switch (ts->kind) { @@ -4847,6 +4925,7 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs) */ static void tcg_reg_alloc_cbranch(TCGContext *s, TCGRegSet allocated_regs) { + assert_carry_dead(s); sync_globals(s, allocated_regs); for (int i = s->nb_globals; i < s->nb_temps; i++) { @@ -5118,6 +5197,10 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op) int const_args[TCG_MAX_OP_ARGS]; TCGCond op_cond; + if (def->flags & TCG_OPF_CARRY_IN) { + tcg_debug_assert(s->carry_live); + } + nb_oargs = def->nb_oargs; nb_iargs = def->nb_iargs; @@ -5374,6 +5457,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op) tcg_reg_alloc_bb_end(s, i_allocated_regs); } else { if (def->flags & TCG_OPF_CALL_CLOBBER) { + assert_carry_dead(s); /* XXX: permit generic clobber register list ? */ for (i = 0; i < TCG_TARGET_NB_REGS; i++) { if (tcg_regset_test_reg(tcg_target_call_clobber_regs, i)) { @@ -5490,7 +5574,8 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op) case INDEX_op_sub: { - const TCGOutOpSubtract *out = &outop_sub; + const TCGOutOpSubtract *out = + container_of(all_outop[op->opc], TCGOutOpSubtract, base); tcg_debug_assert(!const_args[2]); if (const_args[1]) { @@ -5501,6 +5586,16 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op) } break; + case INDEX_op_addco: + case INDEX_op_subbo: + case INDEX_op_addci: + case INDEX_op_subbi: + case INDEX_op_addcio: + case INDEX_op_subbio: + case INDEX_op_addc1o: + case INDEX_op_subb1o: + g_assert_not_reached(); + case INDEX_op_bswap64: case INDEX_op_ext_i32_i64: case INDEX_op_extu_i32_i64: @@ -5688,6 +5783,13 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op) break; } + if (def->flags & TCG_OPF_CARRY_IN) { + s->carry_live = false; + } + if (def->flags & TCG_OPF_CARRY_OUT) { + s->carry_live = true; + } + /* move the outputs in the correct register if needed */ for(i = 0; i < nb_oargs; i++) { ts = arg_temp(op->args[i]); @@ -6690,6 +6792,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start) tcg_out_tb_start(s); num_insns = -1; + s->carry_live = false; QTAILQ_FOREACH(op, &s->ops, link) { TCGOpcode opc = op->opc; @@ -6718,6 +6821,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start) tcg_reg_alloc_dup(s, op); break; case INDEX_op_insn_start: + assert_carry_dead(s); if (num_insns >= 0) { size_t off = tcg_current_code_size(s); s->gen_insn_end_off[num_insns] = off; @@ -6738,6 +6842,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start) tcg_out_label(s, arg_label(op->args[0])); break; case INDEX_op_call: + assert_carry_dead(s); tcg_reg_alloc_call(s, op); break; case INDEX_op_exit_tb: @@ -6774,6 +6879,8 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start) return -2; } } + assert_carry_dead(s); + tcg_debug_assert(num_insns + 1 == s->gen_tb->icount); s->gen_insn_end_off[num_insns] = tcg_current_code_size(s); diff --git a/docs/devel/tcg-ops.rst b/docs/devel/tcg-ops.rst index 9392d88069..93bcc70639 100644 --- a/docs/devel/tcg-ops.rst +++ b/docs/devel/tcg-ops.rst @@ -593,6 +593,67 @@ Multiword arithmetic support .. list-table:: + * - addco *t0*, *t1*, *t2* + + - | Compute *t0* = *t1* + *t2* and in addition output to the + carry bit provided by the host architecture. + + * - addci *t0, *t1*, *t2* + + - | Compute *t0* = *t1* + *t2* + *C*, where *C* is the + input carry bit provided by the host architecture. + The output carry bit need not be computed. + + * - addcio *t0, *t1*, *t2* + + - | Compute *t0* = *t1* + *t2* + *C*, where *C* is the + input carry bit provided by the host architecture, + and also compute the output carry bit. + + * - addc1o *t0, *t1*, *t2* + + - | Compute *t0* = *t1* + *t2* + 1, and in addition output to the + carry bit provided by the host architecture. This is akin to + *addcio* with a fixed carry-in value of 1. + | This is intended to be used by the optimization pass, + intermediate to complete folding of the addition chain. + In some cases complete folding is not possible and this + opcode will remain until output. If this happens, the + code generator will use ``tcg_out_set_carry`` and then + the output routine for *addcio*. + + * - subbo *t0*, *t1*, *t2* + + - | Compute *t0* = *t1* - *t2* and in addition output to the + borrow bit provided by the host architecture. + | Depending on the host architecture, the carry bit may or may not be + identical to the borrow bit. Thus the addc\* and subb\* + opcodes must not be mixed. + + * - subbi *t0, *t1*, *t2* + + - | Compute *t0* = *t1* - *t2* - *B*, where *B* is the + input borrow bit provided by the host architecture. + The output borrow bit need not be computed. + + * - subbio *t0, *t1*, *t2* + + - | Compute *t0* = *t1* - *t2* - *B*, where *B* is the + input borrow bit provided by the host architecture, + and also compute the output borrow bit. + + * - subb1o *t0, *t1*, *t2* + + - | Compute *t0* = *t1* - *t2* - 1, and in addition output to the + borrow bit provided by the host architecture. This is akin to + *subbio* with a fixed borrow-in value of 1. + | This is intended to be used by the optimization pass, + intermediate to complete folding of the subtraction chain. + In some cases complete folding is not possible and this + opcode will remain until output. If this happens, the + code generator will use ``tcg_out_set_borrow`` and then + the output routine for *subbio*. + * - add2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high* sub2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high* -- 2.43.0