This commit optimizes fence instructions. Two optimizations are currently implemented:
1. Eliminating unnecessary duplicate fence instructions

   If the same fence instruction is detected consecutively, we remove
   one instance of it.

   ex: mb; mb => mb,  strl; strl => strl

2. Merging a weaker fence with a subsequent/previous stronger fence

   A load-acquire/store-release fence can be combined with a full fence
   without relaxing the ordering constraint.

   ex: a) ld; ldaq; mb => ld; mb
       b) mb; strl; st => mb; st

Signed-off-by: Pranith Kumar <bobby.pr...@gmail.com>
---
v2:
 - Properly remove the current op
 - Reset only when we encounter memory operations or the end of a block
 - Address review comments from v1

 tcg/optimize.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 tcg/tcg.c      |  4 ++++
 tcg/tcg.h      |  1 +
 3 files changed, 89 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index cffe89b..5963a39 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -538,6 +538,90 @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
     return false;
 }
 
+/* Eliminate duplicate and unnecessary fence instructions */
+void tcg_optimize_mb(TCGContext *s)
+{
+    int oi, oi_next;
+    TCGArg prev_op_mb = -1;
+    TCGOp *prev_op = NULL;
+
+    for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) {
+        TCGOp *op = &s->gen_op_buf[oi];
+        TCGArg *args = &s->gen_opparam_buf[op->args];
+        TCGOpcode opc = op->opc;
+
+        switch (opc) {
+        case INDEX_op_mb:
+        {
+            TCGBar curr_mb_type = args[0] & 0xF0;
+            TCGBar prev_mb_type = prev_op_mb & 0xF0;
+
+            if (curr_mb_type == prev_mb_type ||
+                (curr_mb_type == TCG_BAR_STRL && prev_mb_type == TCG_BAR_SC)) {
+                /* Remove the current weaker barrier op. The previous
+                 * barrier is stronger and sufficient.
+                 * mb; strl => mb; st
+                 */
+                tcg_op_remove(s, op);
+                op = prev_op;
+                break;
+            } else if (curr_mb_type == TCG_BAR_SC &&
+                       prev_mb_type == TCG_BAR_LDAQ) {
+                /* Remove the previous weaker barrier op. The current
+                 * barrier is stronger and sufficient.
+                 * ldaq; mb => ld; mb
+                 */
+                tcg_op_remove(s, prev_op);
+            } else if (curr_mb_type == TCG_BAR_STRL &&
+                       prev_mb_type == TCG_BAR_LDAQ) {
+                /* Consecutive load-acquire and store-release barriers
+                 * can be merged into one stronger SC barrier
+                 * ldaq; strl => ld; mb; st
+                 */
+                args[0] = (args[0] & 0x0F) | TCG_BAR_SC;
+                tcg_op_remove(s, prev_op);
+            }
+            prev_op_mb = args[0];
+            prev_op = op;
+            break;
+        }
+        case INDEX_op_insn_start:
+            break;
+        case INDEX_op_ld8u_i32:
+        case INDEX_op_ld8u_i64:
+        case INDEX_op_ld8s_i32:
+        case INDEX_op_ld8s_i64:
+        case INDEX_op_ld16u_i32:
+        case INDEX_op_ld16u_i64:
+        case INDEX_op_ld16s_i32:
+        case INDEX_op_ld16s_i64:
+        case INDEX_op_ld_i32:
+        case INDEX_op_ld32u_i64:
+        case INDEX_op_ld32s_i64:
+        case INDEX_op_ld_i64:
+        case INDEX_op_st8_i32:
+        case INDEX_op_st8_i64:
+        case INDEX_op_st16_i32:
+        case INDEX_op_st16_i64:
+        case INDEX_op_st_i32:
+        case INDEX_op_st32_i64:
+        case INDEX_op_st_i64:
+        case INDEX_op_call:
+            prev_op_mb = -1;
+            prev_op = NULL;
+            break;
+        default:
+            if (tcg_op_defs[opc].flags & TCG_OPF_BB_END) {
+                prev_op_mb = -1;
+                prev_op = NULL;
+            }
+            break;
+        }
+
+        oi_next = op->next;
+    }
+}
+
 /* Propagate constants and copies, fold constant expressions. */
 void tcg_optimize(TCGContext *s)
 {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 42417bd..1db319e 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2587,6 +2587,10 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
         }
     }
 
+#ifdef USE_TCG_OPTIMIZATIONS
+    tcg_optimize_mb(s);
+#endif
+
 #ifdef CONFIG_PROFILER
     s->la_time += profile_getclock();
 #endif
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 9ed78dc..79bb5bb 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -921,6 +921,7 @@ void tcg_op_remove(TCGContext *s, TCGOp *op);
 TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *op, TCGOpcode opc, int narg);
 TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *op, TCGOpcode opc, int narg);
 
+void tcg_optimize_mb(TCGContext *s);
 void tcg_optimize(TCGContext *s);
 
 /* only used for debugging purposes */
-- 
2.9.2
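
P.S. For reviewers, here is a minimal standalone sketch (not part of the
patch) that models the three rewrite rules of tcg_optimize_mb() as a pure
function over barrier args. The BAR_* values mirror my reading of the
TCG_BAR_* encoding in tcg.h (LDAQ=0x10, STRL=0x20, SC=0x30, with the low
nibble holding the TCG_MO_* ordering bits); treat those values as
assumptions and check them against your tree.

#include <stdio.h>

/* Barrier-type encodings assumed to match TCG_BAR_* in tcg.h. */
enum {
    BAR_LDAQ = 0x10,  /* load-acquire barrier */
    BAR_STRL = 0x20,  /* store-release barrier */
    BAR_SC   = 0x30,  /* full (sequentially consistent) barrier */
};

/*
 * Given the args of two consecutive barrier ops, return the single arg
 * that should survive, or -1 if neither op can be elided.  This mirrors
 * the three rules in tcg_optimize_mb():
 *   1. duplicates:     mb; mb     => mb
 *   2. absorption:     mb; strl   => mb   and   ldaq; mb => mb
 *   3. strengthening:  ldaq; strl => mb (SC)
 */
static int merge_barriers(int prev, int curr)
{
    int pt = prev & 0xF0;  /* barrier type of the previous op */
    int ct = curr & 0xF0;  /* barrier type of the current op  */

    if (ct == pt) {
        return prev;                    /* duplicate: drop the current op */
    }
    if (pt == BAR_SC && ct == BAR_STRL) {
        return prev;                    /* previous is stronger: drop current */
    }
    if (pt == BAR_LDAQ && ct == BAR_SC) {
        return curr;                    /* current is stronger: drop previous */
    }
    if (pt == BAR_LDAQ && ct == BAR_STRL) {
        /* merge to SC; like the patch, keep only curr's low MO bits */
        return (curr & 0x0F) | BAR_SC;
    }
    return -1;                          /* no rule applies: keep both ops */
}

int main(void)
{
    printf("ldaq;strl -> %#x\n", merge_barriers(BAR_LDAQ, BAR_STRL)); /* 0x30 */
    printf("mb;strl   -> %#x\n", merge_barriers(BAR_SC,   BAR_STRL)); /* 0x30 */
    printf("strl;ldaq -> %d\n",  merge_barriers(BAR_STRL, BAR_LDAQ)); /* -1   */
    return 0;
}

Note that, like the patch, the ldaq; strl case keeps only the current op's
TCG_MO_* bits when strengthening to SC rather than OR-ing in the previous
op's bits; whether that is the intended semantics is worth confirming in
review.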