From: Xiong Hu Luo <luo...@linux.ibm.com>

adjust_iv_update_pos in tree-ssa-loop-ivopts doesn't help performance
on Power.  For example, after it moves the IV update statement, the two
memory references end up with mismatched address offsets:

<bb 32> [local count: 70988443]:
_84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
ivtmp.30_415 = ivtmp.30_414 + 1;
_34 = ref_180 + 18446744073709551615;
_86 = MEM[(uint8_t *)_34 + ivtmp.30_415 * 1];
if (_84 == _86)
  goto <bb 56>; [94.50%]
  else
  goto <bb 87>; [5.50%]

Disabling it produces:

<bb 32> [local count: 70988443]:
_84 = MEM[(uint8_t *)ip_229 + ivtmp.30_414 * 1];
_86 = MEM[(uint8_t *)ref_180 + ivtmp.30_414 * 1];
ivtmp.30_415 = ivtmp.30_414 + 1;
if (_84 == _86)
  goto <bb 56>; [94.50%]
  else
  goto <bb 87>; [5.50%]

A later loop-unrolling pass can then benefit from both references using
the same address offset with different base addresses, which reduces
register dependencies.  This patch can improve performance by 10% for a
typical case on Power; no performance change was observed on X86 or
AArch64 because small loops are not unrolled on these platforms.
Any comments?

.L67:
  lbzx %r7,%r8,%r6
  lbzx %r12,%r25,%r4
  cmpw %cr0,%r7,%r12
  bne %cr0,.L11
  lbzx %r7,%r8,%r4
  mr %r6,%r4
  addi %r4,%r4,1
  lbzx %r12,%r25,%r4
  mr %r11,%r6
  cmpw %cr0,%r7,%r12
  bne %cr0,.L11
  mr %r6,%r4
.L12:
  cmpdi %cr0,%r10,1
  addi %r4,%r6,1
  mr %r11,%r6
  addi %r10,%r10,-1
  bne %cr0,.L67

vs.

.L67:
  lbzx %r25,%r8,%r6
  lbzx %r12,%r7,%r6
  addi %r4,%r6,1
  cmpw %cr0,%r25,%r12
  bne %cr0,.L11
  lbzx %r12,%r8,%r4
  lbzx %r25,%r7,%r4
  mr %r6,%r4
  mr %r11,%r4
  cmpw %cr0,%r12,%r25
  bne %cr0,.L11
  addi %r6,%r4,1
.L12:
  cmpdi %cr0,%r10,1
  mr %r11,%r6
  addi %r10,%r10,-1
  bne %cr0,.L67

gcc/ChangeLog:

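        * config/rs6000/rs6000.c (TARGET_ADJUST_IV_UPDATE_POS): Define.
        (rs6000_adjust_iv_update_pos): New function.
        * doc/tm.texi: Regenerate.
        * doc/tm.texi.in (TARGET_ADJUST_IV_UPDATE_POS): Add hook.
        * target.def (adjust_iv_update_pos): New hook.
        * targhooks.c (default_adjust_iv_update_pos): New function.
        * targhooks.h (default_adjust_iv_update_pos): Declare.
        * tree-ssa-loop-ivopts.c (rewrite_use_address): Only call
        adjust_iv_update_pos when the target hook returns true.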
---
 gcc/config/rs6000/rs6000.c | 11 +++++++++++
 gcc/doc/tm.texi            |  5 +++++
 gcc/doc/tm.texi.in         |  2 ++
 gcc/target.def             |  7 +++++++
 gcc/targhooks.c            |  6 ++++++
 gcc/targhooks.h            |  2 ++
 gcc/tree-ssa-loop-ivopts.c |  3 ++-
 7 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index cd130dea611..e7725997793 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -1455,6 +1455,9 @@ static const struct attribute_spec rs6000_attribute_table[] =
 #undef TARGET_LOOP_UNROLL_ADJUST
 #define TARGET_LOOP_UNROLL_ADJUST rs6000_loop_unroll_adjust
 
+#undef TARGET_ADJUST_IV_UPDATE_POS
+#define TARGET_ADJUST_IV_UPDATE_POS rs6000_adjust_iv_update_pos
+
 #undef TARGET_INIT_BUILTINS
 #define TARGET_INIT_BUILTINS rs6000_init_builtins
 #undef TARGET_BUILTIN_DECL
@@ -5457,6 +5460,14 @@ rs6000_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
   return nunroll;
 }
 
+/* Implement targetm.adjust_iv_update_pos.  */
+
+bool
+rs6000_adjust_iv_update_pos (void)
+{
+  return false;
+}
+
 /* Handler for the Mathematical Acceleration Subsystem (mass) interface to a
    library with vectorized intrinsics.  */
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index b272fa4806d..07ce40eb053 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11768,6 +11768,11 @@ By default, the RTL loop optimizer does not use a present doloop pattern for
 loops containing function calls or branch on table instructions.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_ADJUST_IV_UPDATE_POS (void)
+if adjust_iv_update_pos is enabled, reorder the iv update statement,
+ then mem ref uses the iv value after update.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_LEGITIMATE_COMBINED_INSN (rtx_insn *@var{insn})
 Take an instruction in @var{insn} and return @code{false} if the instruction is not appropriate as a combination of two or more instructions.  The default is to accept all instructions.
 @end deftypefn
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index bf724dc093c..87d02089588 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7979,6 +7979,8 @@ to by @var{ce_info}.
 
 @hook TARGET_INVALID_WITHIN_DOLOOP
 
+@hook TARGET_ADJUST_IV_UPDATE_POS
+
 @hook TARGET_LEGITIMATE_COMBINED_INSN
 
 @hook TARGET_CAN_FOLLOW_JUMP
diff --git a/gcc/target.def b/gcc/target.def
index d7b94bd8e5d..aead7cb79ff 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4398,6 +4398,13 @@ loops containing function calls or branch on table instructions.",
  const char *, (const rtx_insn *insn),
  default_invalid_within_doloop)
 
+/* Function to adjust iv update statement position.  */
+DEFHOOK
+(adjust_iv_update_pos,
+ "if adjust_iv_update_pos is enabled, reorder the iv update statement,\n\
+ then mem ref uses the iv value after update.",
+ bool, (void), default_adjust_iv_update_pos)
+
 /* Returns true for a legitimate combined insn.  */
 DEFHOOK
 (legitimate_combined_insn,
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index d69c9a2d819..2a93a3489e6 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -679,6 +679,12 @@ default_invalid_within_doloop (const rtx_insn *insn)
   return NULL;
 }
 
+bool
+default_adjust_iv_update_pos (void)
+{
+  return true;
+}
+
 /* Mapping of builtin functions to vectorized variants.  */
 
 tree
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 39a6f82f143..298ecd4fc99 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -90,6 +90,8 @@ extern bool default_has_ifunc_p (void);
 extern bool default_predict_doloop_p (class loop *);
 extern const char * default_invalid_within_doloop (const rtx_insn *);
 
+extern bool default_adjust_iv_update_pos (void);
+
 extern tree default_builtin_vectorized_function (unsigned int, tree, tree);
 extern tree default_builtin_md_vectorized_function (tree, tree, tree);
 
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 4012ae3f19d..5dbc306862c 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -7438,7 +7438,8 @@ rewrite_use_address (struct ivopts_data *data,
   aff_tree aff;
   bool ok;
 
-  adjust_iv_update_pos (cand, use);
+  if (targetm.adjust_iv_update_pos ())
+    adjust_iv_update_pos (cand, use);
   ok = get_computation_aff (data->current_loop, use->stmt, use, cand, &aff);
   gcc_assert (ok);
   unshare_aff_combination (&aff);
-- 
2.25.1

Reply via email to