From: Xiong Hu Luo <luo...@linux.ibm.com> adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance on Power. For example, it generates mismatched address offsets after adjusting the iv update statement position:
<bb 32> [local count: 70988443]: _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1]; ivtmp.30_415 = ivtmp.30_414 + 1; _34 = ref_180 + 18446744073709551615; _86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1]; if (_84 == _86) goto <bb 56>; [94.50%] else goto <bb 87>; [5.50%] Disabling it will produce: <bb 32> [local count: 70988443]: _84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1]; _86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1]; ivtmp.30_415 = ivtmp.30_414 + 1; if (_84 == _86) goto <bb 56>; [94.50%] else goto <bb 87>; [5.50%] Then the later loop unroll pass could benefit from the same address offset with different base addresses, which reduces register dependencies. This patch could improve performance by 10% for a typical case on Power; no performance change was observed for X86 or AArch64, because small loops are not unrolled on these platforms. Any comments? .L67: lbzx %r7,%r8,%r6 lbzx %r12,%r25,%r4 cmpw %cr0,%r7,%r12 bne %cr0,.L11 lbzx %r7,%r8,%r4 mr %r6,%r4 addi %r4,%r4,1 lbzx %r12,%r25,%r4 mr %r11,%r6 cmpw %cr0,%r7,%r12 bne %cr0,.L11 mr %r6,%r4 .L12: cmpdi %cr0,%r10,1 addi %r4,%r6,1 mr %r11,%r6 addi %r10,%r10,-1 bne %cr0,.L67 vs. 
.L67: lbzx %r25,%r8,%r6 lbzx %r12,%r7,%r6 addi %r4,%r6,1 cmpw %cr0,%r25,%r12 bne %cr0,.L11 lbzx %r12,%r8,%r4 lbzx %r25,%r7,%r4 mr %r6,%r4 mr %r11,%r4 cmpw %cr0,%r12,%r25 bne %cr0,.L11 addi %r6,%r4,1 .L12: cmpdi %cr0,%r10,1 mr %r11,%r6 addi %r10,%r10,-1 bne %cr0,.L67 gcc/ChangeLog: * config/rs6000/rs6000.c (TARGET_ADJUST_IV_UPDATE_POS): (rs6000_adjust_iv_update_pos): * doc/tm.texi: * doc/tm.texi.in: * target.def: * targhooks.c (default_adjust_iv_update_pos): * targhooks.h (default_adjust_iv_update_pos): * tree-ssa-loop-ivopts.c (rewrite_use_address): --- gcc/config/rs6000/rs6000.c | 11 +++++++++++ gcc/doc/tm.texi | 5 +++++ gcc/doc/tm.texi.in | 2 ++ gcc/target.def | 7 +++++++ gcc/targhooks.c | 6 ++++++ gcc/targhooks.h | 2 ++ gcc/tree-ssa-loop-ivopts.c | 3 ++- 7 files changed, 35 insertions(+), 1 deletion(-) diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index cd130dea611..e7725997793 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1455,6 +1455,9 @@ static const struct attribute_spec rs6000_attribute_table[] = #undef TARGET_LOOP_UNROLL_ADJUST #define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust +#undef TARGET_ADJUST_IV_UPDATE_POS +#define TARGET_ADJUST_IV_UPDATE_POS rs6000_adjust_iv_update_pos + #undef TARGET_INIT_BUILTINS #define TARGET_INIT_BUILTINS rs6000_init_builtins #undef TARGET_BUILTIN_DECL @@ -5457,6 +5460,14 @@ rs6000_loop_unroll_adjust (unsigned nunroll, struct loop *loop) return nunroll; } +/* Implement targetm.adjust_iv_update_pos. */ + +bool +rs6000_adjust_iv_update_pos (void) +{ + return false; +} + /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a library with vectorized intrinsics. 
*/ diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index b272fa4806d..07ce40eb053 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -11768,6 +11768,11 @@ By default, the RTL loop optimizer does not use a present doloop pattern for loops containing function calls or branch on table instructions. @end deftypefn +@deftypefn {Target Hook} bool TARGET_ADJUST_IV_UPDATE_POS (void) +if adjust_iv_update_pos is enabled, reorder the iv update statement, + then mem ref uses the iv value after update. +@end deftypefn + @deftypefn {Target Hook} bool TARGET_LEGITIMATE_COMBINED_INSN (rtx_insn *@var{insn}) Take an instruction in @var{insn} and return @code{false} if the instruction is not appropriate as a combination of two or more instructions. The default is to accept all instructions. @end deftypefn diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index bf724dc093c..87d02089588 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -7979,6 +7979,8 @@ to by @var{ce_info}. @hook TARGET_INVALID_WITHIN_DOLOOP +@hook TARGET_ADJUST_IV_UPDATE_POS + @hook TARGET_LEGITIMATE_COMBINED_INSN @hook TARGET_CAN_FOLLOW_JUMP diff --git a/gcc/target.def b/gcc/target.def index d7b94bd8e5d..aead7cb79ff 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -4398,6 +4398,13 @@ loops containing function calls or branch on table instructions.", const char *, (const rtx_insn *insn), default_invalid_within_doloop) +/* Function to adjust iv update statment position. */ +DEFHOOK +(adjust_iv_update_pos, + "if adjust_iv_update_pos is enabled, reorder the iv update statement,\n\ + then mem ref uses the iv value after update.", + bool, (void), default_adjust_iv_update_pos) + /* Returns true for a legitimate combined insn. 
*/ DEFHOOK (legitimate_combined_insn, diff --git a/gcc/targhooks.c b/gcc/targhooks.c index d69c9a2d819..2a93a3489e6 100644 --- a/gcc/targhooks.c +++ b/gcc/targhooks.c @@ -679,6 +679,12 @@ default_invalid_within_doloop (const rtx_insn *insn) return NULL; } +bool +default_adjust_iv_update_pos (void) +{ + return true; +} + /* Mapping of builtin functions to vectorized variants. */ tree diff --git a/gcc/targhooks.h b/gcc/targhooks.h index 39a6f82f143..298ecd4fc99 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -90,6 +90,8 @@ extern bool default_has_ifunc_p (void); extern bool default_predict_doloop_p (class loop *); extern const char * default_invalid_within_doloop (const rtx_insn *); +extern bool default_adjust_iv_update_pos (void); + extern tree default_builtin_vectorized_function (unsigned int, tree, tree); extern tree default_builtin_md_vectorized_function (tree, tree, tree); diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c index 4012ae3f19d..5dbc306862c 100644 --- a/gcc/tree-ssa-loop-ivopts.c +++ b/gcc/tree-ssa-loop-ivopts.c @@ -7438,7 +7438,8 @@ rewrite_use_address (struct ivopts_data *data, aff_tree aff; bool ok; - adjust_iv_update_pos (cand, use); + if (targetm.adjust_iv_update_pos ()) + adjust_iv_update_pos (cand, use); ok = get_computation_aff (data->current_loop, use->stmt, use, cand, &aff); gcc_assert (ok); unshare_aff_combination (&aff); -- 2.25.1